diff --color -rcNP Master/arch/arm/include/asm/elf.h OG/arch/arm/include/asm/elf.h
*** Master/arch/arm/include/asm/elf.h	2021-04-20 14:17:26.000000000 -0400
--- OG/arch/arm/include/asm/elf.h	2021-04-20 15:11:34.498000000 -0400
***************
*** 121,128 ****
  /* This is the base location for PIE (ET_DYN with INTERP) loads. */
  #define ELF_ET_DYN_BASE		0x400000UL
  
! /* When the program starts, a1 contains a pointer to a function to be 
!    registered with atexit, as per the SVR4 ABI.  A value of 0 means we 
     have no such handler.  */
  #define ELF_PLAT_INIT(_r, load_addr)	(_r)->ARM_r0 = 0
  
--- 121,135 ----
  /* This is the base location for PIE (ET_DYN with INTERP) loads. */
  #define ELF_ET_DYN_BASE		0x400000UL
  
! #ifdef CONFIG_MINISEC_ASLR
! #define PAX_ELF_ET_DYN_BASE	0x00008000UL
! /* Both lengths evaluate to 16 when current->personality equals PER_LINUX_32BIT, otherwise 10 (presumably random bits for mmap/stack placement; confirm at the users). */
! #define PAX_DELTA_MMAP_LEN	((current->personality == PER_LINUX_32BIT) ? 16 : 10)
! #define PAX_DELTA_STACK_LEN	((current->personality == PER_LINUX_32BIT) ? 16 : 10)
! #endif
! 
! /* When the program starts, a1 contains a pointer to a function to be
!    registered with atexit, as per the SVR4 ABI.  A value of 0 means we
     have no such handler.  */
  #define ELF_PLAT_INIT(_r, load_addr)	(_r)->ARM_r0 = 0
  
diff --color -rcNP Master/arch/arm/include/asm/elf.h.orig OG/arch/arm/include/asm/elf.h.orig
*** Master/arch/arm/include/asm/elf.h.orig	1969-12-31 19:00:00.000000000 -0500
--- OG/arch/arm/include/asm/elf.h.orig	2021-04-20 15:11:27.308000000 -0400
***************
*** 0 ****
--- 1,152 ----
+ /* SPDX-License-Identifier: GPL-2.0 */
+ #ifndef __ASMARM_ELF_H
+ #define __ASMARM_ELF_H
+ 
+ #include <asm/auxvec.h>
+ #include <asm/hwcap.h>
+ #include <asm/vdso_datapage.h>
+ 
+ /*
+  * ELF register definitions..
+  */
+ #include <asm/ptrace.h>
+ #include <asm/user.h>
+ 
+ struct task_struct;
+ 
+ typedef unsigned long elf_greg_t;
+ typedef unsigned long elf_freg_t[3];
+ 
+ #define ELF_NGREG (sizeof (struct pt_regs) / sizeof(elf_greg_t))
+ typedef elf_greg_t elf_gregset_t[ELF_NGREG];
+ 
+ typedef struct user_fp elf_fpregset_t;
+ 
+ #define EF_ARM_EABI_MASK	0xff000000
+ #define EF_ARM_EABI_UNKNOWN	0x00000000
+ #define EF_ARM_EABI_VER1	0x01000000
+ #define EF_ARM_EABI_VER2	0x02000000
+ #define EF_ARM_EABI_VER3	0x03000000
+ #define EF_ARM_EABI_VER4	0x04000000
+ #define EF_ARM_EABI_VER5	0x05000000
+ 
+ #define EF_ARM_BE8		0x00800000	/* ABI 4,5 */
+ #define EF_ARM_LE8		0x00400000	/* ABI 4,5 */
+ #define EF_ARM_MAVERICK_FLOAT	0x00000800	/* ABI 0 */
+ #define EF_ARM_VFP_FLOAT	0x00000400	/* ABI 0 */
+ #define EF_ARM_SOFT_FLOAT	0x00000200	/* ABI 0 */
+ #define EF_ARM_OLD_ABI		0x00000100	/* ABI 0 */
+ #define EF_ARM_NEW_ABI		0x00000080	/* ABI 0 */
+ #define EF_ARM_ALIGN8		0x00000040	/* ABI 0 */
+ #define EF_ARM_PIC		0x00000020	/* ABI 0 */
+ #define EF_ARM_MAPSYMSFIRST	0x00000010	/* ABI 2 */
+ #define EF_ARM_APCS_FLOAT	0x00000010	/* ABI 0, floats in fp regs */
+ #define EF_ARM_DYNSYMSUSESEGIDX	0x00000008	/* ABI 2 */
+ #define EF_ARM_APCS_26		0x00000008	/* ABI 0 */
+ #define EF_ARM_SYMSARESORTED	0x00000004	/* ABI 1,2 */
+ #define EF_ARM_INTERWORK	0x00000004	/* ABI 0 */
+ #define EF_ARM_HASENTRY		0x00000002	/* All */
+ #define EF_ARM_RELEXEC		0x00000001	/* All */
+ 
+ #define R_ARM_NONE		0
+ #define R_ARM_PC24		1
+ #define R_ARM_ABS32		2
+ #define R_ARM_CALL		28
+ #define R_ARM_JUMP24		29
+ #define R_ARM_TARGET1		38
+ #define R_ARM_V4BX		40
+ #define R_ARM_PREL31		42
+ #define R_ARM_MOVW_ABS_NC	43
+ #define R_ARM_MOVT_ABS		44
+ 
+ #define R_ARM_THM_CALL		10
+ #define R_ARM_THM_JUMP24	30
+ #define R_ARM_THM_MOVW_ABS_NC	47
+ #define R_ARM_THM_MOVT_ABS	48
+ 
+ /*
+  * These are used to set parameters in the core dumps.
+  */
+ #define ELF_CLASS	ELFCLASS32
+ #ifdef __ARMEB__
+ #define ELF_DATA	ELFDATA2MSB
+ #else
+ #define ELF_DATA	ELFDATA2LSB
+ #endif
+ #define ELF_ARCH	EM_ARM
+ 
+ /*
+  * This yields a string that ld.so will use to load implementation
+  * specific libraries for optimization.  This is more specific in
+  * intent than poking at uname or /proc/cpuinfo.
+  *
+  * For now we just provide a fairly general string that describes the
+  * processor family.  This could be made more specific later if someone
+  * implemented optimisations that require it.  26-bit CPUs give you
+  * "v1l" for ARM2 (no SWP) and "v2l" for anything else (ARM1 isn't
+  * supported).  32-bit CPUs give you "v3[lb]" for anything based on an
+  * ARM6 or ARM7 core and "armv4[lb]" for anything based on a StrongARM-1
+  * core.
+  */
+ #define ELF_PLATFORM_SIZE 8
+ #define ELF_PLATFORM	(elf_platform)
+ 
+ extern char elf_platform[];
+ 
+ struct elf32_hdr;
+ 
+ /*
+  * This is used to ensure we don't load something for the wrong architecture.
+  */
+ extern int elf_check_arch(const struct elf32_hdr *);
+ #define elf_check_arch elf_check_arch
+ 
+ #define ELFOSABI_ARM_FDPIC  65	/* ARM FDPIC platform */
+ #define elf_check_fdpic(x)  ((x)->e_ident[EI_OSABI] == ELFOSABI_ARM_FDPIC)
+ #define elf_check_const_displacement(x)  ((x)->e_flags & EF_ARM_PIC)
+ #define ELF_FDPIC_CORE_EFLAGS  0
+ 
+ #define vmcore_elf64_check_arch(x) (0)
+ 
+ extern int arm_elf_read_implies_exec(int);
+ #define elf_read_implies_exec(ex,stk) arm_elf_read_implies_exec(stk)
+ 
+ struct task_struct;
+ int dump_task_regs(struct task_struct *t, elf_gregset_t *elfregs);
+ #define ELF_CORE_COPY_TASK_REGS dump_task_regs
+ 
+ #define CORE_DUMP_USE_REGSET
+ #define ELF_EXEC_PAGESIZE	4096
+ 
+ /* This is the base location for PIE (ET_DYN with INTERP) loads. */
+ #define ELF_ET_DYN_BASE		0x400000UL
+ 
+ /* When the program starts, a1 contains a pointer to a function to be 
+    registered with atexit, as per the SVR4 ABI.  A value of 0 means we 
+    have no such handler.  */
+ #define ELF_PLAT_INIT(_r, load_addr)	(_r)->ARM_r0 = 0
+ 
+ #define ELF_FDPIC_PLAT_INIT(_r, _exec_map_addr, _interp_map_addr, dynamic_addr) \
+ 	do { \
+ 		(_r)->ARM_r7 = _exec_map_addr; \
+ 		(_r)->ARM_r8 = _interp_map_addr; \
+ 		(_r)->ARM_r9 = dynamic_addr; \
+ 	} while(0)
+ 
+ extern void elf_set_personality(const struct elf32_hdr *);
+ #define SET_PERSONALITY(ex)	elf_set_personality(&(ex))
+ 
+ #ifdef CONFIG_MMU
+ #ifdef CONFIG_VDSO
+ #define ARCH_DLINFO						\
+ do {								\
+ 	NEW_AUX_ENT(AT_SYSINFO_EHDR,				\
+ 		    (elf_addr_t)current->mm->context.vdso);	\
+ } while (0)
+ #endif
+ #define ARCH_HAS_SETUP_ADDITIONAL_PAGES 1
+ struct linux_binprm;
+ int arch_setup_additional_pages(struct linux_binprm *, int);
+ #endif
+ 
+ #endif
diff --color -rcNP Master/arch/arm/include/asm/elf.h.rej OG/arch/arm/include/asm/elf.h.rej
*** Master/arch/arm/include/asm/elf.h.rej	1969-12-31 19:00:00.000000000 -0500
--- OG/arch/arm/include/asm/elf.h.rej	2021-04-20 15:11:27.308000000 -0400
***************
*** 0 ****
--- 1,28 ----
+ *** arch/arm/include/asm/elf.h	2021-03-13 13:05:56.000000000 +0200
+ --- arch/arm/include/asm/elf.h	2021-03-11 15:06:51.000000000 +0200
+ ***************
+ *** 121,135 ****
+   /* This is the base location for PIE (ET_DYN with INTERP) loads. */
+   #define ELF_ET_DYN_BASE		0x400000UL
+   
+ ! #ifdef CONFIG_MINISEC_ASLR
+ ! #define PAX_ELF_ET_DYN_BASE	0x00008000UL
+ ! 
+ ! #define PAX_DELTA_MMAP_LEN	((current->personality == PER_LINUX_32BIT) ? 16 : 10)
+ ! #define PAX_DELTA_STACK_LEN	((current->personality == PER_LINUX_32BIT) ? 16 : 10)
+ ! #endif
+ ! 
+ ! /* When the program starts, a1 contains a pointer to a function to be
+ !    registered with atexit, as per the SVR4 ABI.  A value of 0 means we
+      have no such handler.  */
+   #define ELF_PLAT_INIT(_r, load_addr)	(_r)->ARM_r0 = 0
+   
+ --- 121,128 ----
+   /* This is the base location for PIE (ET_DYN with INTERP) loads. */
+   #define ELF_ET_DYN_BASE		0x400000UL
+   
+ ! /* When the program starts, a1 contains a pointer to a function to be
+ !    registered with atexit, as per the SVR4 ABI.  A value of 0 means we
+      have no such handler.  */
+   #define ELF_PLAT_INIT(_r, load_addr)	(_r)->ARM_r0 = 0
+   
diff --color -rcNP Master/arch/arm/mm/fault.c OG/arch/arm/mm/fault.c
*** Master/arch/arm/mm/fault.c	2021-04-20 14:17:26.000000000 -0400
--- OG/arch/arm/mm/fault.c	2021-04-20 15:11:34.498000000 -0400
***************
*** 157,162 ****
--- 157,169 ----
  				   tsk->comm, addr);
  #endif
  
+ #ifdef CONFIG_MINISEC_PAGEEXEC
+ 	if ((tsk->mm->pax_flags & MF_PAX_PAGEEXEC) && (fsr & FSR_LNX_PF)) {	/* PAGEEXEC mm and FSR_LNX_PF set (presumably a prefetch abort; confirm) */
+ 		pax_report_fault(regs, (void *)regs->ARM_pc, (void *)regs->ARM_sp);	/* log using the faulting pc and sp */
+ 		do_group_exit(SIGKILL);	/* terminate the whole thread group */
+ 	}
+ #endif
+ 
  	tsk->thread.address = addr;
  	tsk->thread.error_code = fsr;
  	tsk->thread.trap_no = 14;
***************
*** 384,389 ****
--- 391,423 ----
  }
  #endif					/* CONFIG_MMU */
  
+ #ifdef CONFIG_MINISEC_PAGEEXEC
+ void pax_report_insns(struct pt_regs *regs, void *pc, void *sp)
+ {
+ 	long i;
+ 	/* Log the 20 bytes at user pc; a byte that cannot be read is shown as "??". */
+ 	printk(KERN_ERR "PAX: bytes at PC: ");
+ 	for (i = 0; i < 20; i++) {
+ 		unsigned char c;
+ 		if (get_user(c, (__force unsigned char __user *)pc+i))
+ 			printk(KERN_CONT "?? ");
+ 		else
+ 			printk(KERN_CONT "%02x ", c);
+ 	}
+ 	printk(KERN_CONT "\n");	/* KERN_CONT: finish the line instead of logging an empty record */
+ 	/* Log 21 words starting one word below user sp; unreadable words are shown as "????????". */
+ 	printk(KERN_ERR "PAX: bytes at SP-4: ");
+ 	for (i = -1; i < 20; i++) {
+ 		unsigned long c;
+ 		if (get_user(c, (__force unsigned long __user *)sp+i))
+ 			printk(KERN_CONT "???????? ");
+ 		else
+ 			printk(KERN_CONT "%08lx ", c);
+ 	}
+ 	printk(KERN_CONT "\n");	/* KERN_CONT: finish the line instead of logging an empty record */
+ }
+ #endif
+ 
  /*
   * First Level Translation Fault Handler
   *
diff --color -rcNP Master/arch/x86/entry/entry_32.S OG/arch/x86/entry/entry_32.S
*** Master/arch/x86/entry/entry_32.S	2021-04-20 14:17:26.000000000 -0400
--- OG/arch/x86/entry/entry_32.S	2021-04-20 15:11:34.499000000 -0400
***************
*** 360,365 ****
--- 360,374 ----
  .Lend_\@:
  .endm
  
+ .macro SAVE_ALL pt_regs_ax=%eax
+ #if defined(CONFIG_MINISEC_KERNEXEC) || defined(CONFIG_MINISEC_PAGEEXEC)
+ 	__SAVE_ALL \pt_regs_ax, __KERNEL_DS	/* NOTE(review): __SAVE_ALL is not defined in the visible part of this patch; confirm it exists */
+ 	pax_enter_kernel			/* NOTE(review): pax_enter_kernel is not defined in the visible part either; confirm */
+ #else
+ 	__SAVE_ALL \pt_regs_ax, __USER_DS	/* same save sequence, with __USER_DS as the segment argument */
+ #endif
+ .endm
+ 
  .macro RESTORE_INT_REGS
  	popl	%ebx
  	popl	%ecx
# NOTE(review): hunk dropped. It deleted "movl %esp, %eax" / "call do_int80_syscall_32" ahead of .Lsyscall_32_done without any replacement, so that entry path no longer dispatched the system call.
diff --color -rcNP Master/arch/x86/entry/entry_64.S OG/arch/x86/entry/entry_64.S
*** Master/arch/x86/entry/entry_64.S	2021-04-20 14:17:26.000000000 -0400
--- OG/arch/x86/entry/entry_64.S	2021-04-20 15:11:34.499000000 -0400
***************
*** 53,58 ****
--- 53,67 ----
  END(native_usergs_sysret64)
  #endif /* CONFIG_PARAVIRT */
  
+ #ifdef CONFIG_MINISEC_RANDKSTACK
+ .macro PAX_RAND_KSTACK
+ 	movq  %rsp, %rdi		/* argument: current stack pointer */
+ 	call  pax_randomize_kstack	/* %rax = stack address to switch to */
+ 	movq  %rsp, %rdi		/* leave the pre-switch stack pointer in %rdi for the macro user */
+ 	movq  %rax, %rsp		/* switch to the returned address */
+ .endm
+ #endif
+ 
  .macro TRACE_IRQS_FLAGS flags:req
  #ifdef CONFIG_TRACE_IRQFLAGS
  	btl	$9, \flags		/* interrupts off? */
***************
*** 170,178 ****
--- 179,205 ----
  	TRACE_IRQS_OFF
  
  	/* IRQs are off. */
+ 	/*
+ 	 * do_syscall_64 expects syscall-nr (pt_regs->orig_ax) as the first
+ 	 * argument (%rdi) and pointer to pt_regs as the second argument (%rsi).
+ 	 */
+ #ifdef CONFIG_MINISEC_RANDKSTACK
+ 	pushq	%rax			/* keep %rax across the call below */
+ 	movq	%rsp, %rdi		/* argument: stack pointer, now at the saved %rax slot */
+ 	call	pax_randomize_kstack	/* %rax = stack address to run the syscall on */
+ 	popq	%rdi			/* saved %rax becomes the first do_syscall_64 argument */
+ 	movq	%rsp, %rsi		/* second argument, same value the #else branch passes */
+ 	movq	%rax, %rsp		/* switch to the returned stack address */
+ 
+ 	pushq	%rsi			/* remember the original stack pointer on the new stack */
+ #else
  	movq	%rax, %rdi
  	movq	%rsp, %rsi
+ #endif
  	call	do_syscall_64		/* returns with IRQs disabled */
+ #ifdef CONFIG_MINISEC_RANDKSTACK
+ 	popq	%rsp			/* back to the original stack pointer saved above */
+ #endif
  
  	TRACE_IRQS_IRETQ		/* we're about to change IF */
  
***************
*** 340,348 ****
--- 367,383 ----
  
  2:
  	UNWIND_HINT_REGS
+ #ifdef CONFIG_MINISEC_RANDKSTACK
+   	PAX_RAND_KSTACK			/* switches stacks; %rdi = pre-switch stack pointer */
+   	pushq	%rdi			/* save it so it can be popped back into %rsp */
+ #else
  	movq	%rsp, %rdi
+ #endif
  	call	syscall_return_slowpath	/* returns with IRQs disabled */
  	TRACE_IRQS_ON			/* user mode is traced as IRQS on */
+ #ifdef CONFIG_MINISEC_RANDKSTACK
+   	popq	%rsp			/* restore the pre-switch stack pointer */
+ #endif
  	jmp	swapgs_restore_regs_and_return_to_usermode
  
  1:
diff --color -rcNP Master/arch/x86/include/asm/elf.h OG/arch/x86/include/asm/elf.h
*** Master/arch/x86/include/asm/elf.h	2021-04-20 14:17:26.000000000 -0400
--- OG/arch/x86/include/asm/elf.h	2021-04-20 15:11:34.499000000 -0400
***************
*** 254,259 ****
--- 254,273 ----
  #define ELF_ET_DYN_BASE		(mmap_is_ia32() ? 0x000400000UL : \
  						  0x100000000UL)
  
+ #ifdef CONFIG_MINISEC_ASLR
+ #ifdef CONFIG_X86_32
+ #define PAX_ELF_ET_DYN_BASE	0x10000000UL
+ /* 15 when the mm has MF_PAX_SEGMEXEC set, otherwise 16; the stack length expands to the same expression */
+ #define PAX_DELTA_MMAP_LEN	((current->mm->pax_flags & MF_PAX_SEGMEXEC) ? 15 : 16)
+ #define PAX_DELTA_STACK_LEN	PAX_DELTA_MMAP_LEN
+ #else /* !CONFIG_X86_32 */
+ #define PAX_ELF_ET_DYN_BASE	0x400000UL
+ /* 16 for TIF_ADDR32 threads, otherwise derived from the task-size shift; stack length expands identically */
+ #define PAX_DELTA_MMAP_LEN	(test_thread_flag(TIF_ADDR32) ? 16 : (CONFIG_TASK_SIZE_MAX_SHIFT - PAGE_SHIFT - 3))
+ #define PAX_DELTA_STACK_LEN	PAX_DELTA_MMAP_LEN
+ #endif /* CONFIG_X86_32 */
+ #endif /* CONFIG_MINISEC_ASLR */
+ 
  /* This yields a mask that user programs can use to figure out what
     instruction set this CPU supports.  This could be done in user space,
     but it's not easy, and we've already done it here.  */
diff --color -rcNP Master/arch/x86/include/asm/mmu.h OG/arch/x86/include/asm/mmu.h
*** Master/arch/x86/include/asm/mmu.h	2021-04-20 14:17:26.000000000 -0400
--- OG/arch/x86/include/asm/mmu.h	2021-04-20 15:11:34.499000000 -0400
***************
*** 38,46 ****
  #endif
  
  	struct mutex lock;
! 	void __user *vdso;			/* vdso base address */
  	const struct vdso_image *vdso_image;	/* vdso image in use */
  
  	atomic_t perf_rdpmc_allowed;	/* nonzero if rdpmc is allowed */
  #ifdef CONFIG_X86_INTEL_MEMORY_PROTECTION_KEYS
  	/*
--- 38,58 ----
  #endif
  
  	struct mutex lock;
! 	unsigned long *vdso;			/* vdso base address; NOTE(review): no longer void __user *, so arithmetic on it scales by sizeof(long) - confirm all users */
  	const struct vdso_image *vdso_image;	/* vdso image in use */
  
+ #if defined(CONFIG_X86_32) && defined(CONFIG_MINISEC_PAGEEXEC)
+ 	/* PAGEEXEC per-mm state on 32-bit; base and limit presumably describe the user code segment (confirm) */
+ 	unsigned long user_cs_base;
+ 	unsigned long user_cs_limit;
+ 
+ #ifdef CONFIG_SMP
+ 	/* CPU mask that the PAGEEXEC fault path tests with cpumask_test_cpu() */
+ 	cpumask_t cpu_user_cs_mask;
+ #endif /* CONFIG_SMP */
+ 
+ #endif /* CONFIG_X86_32 && CONFIG_MINISEC_PAGEEXEC */
+ 
  	atomic_t perf_rdpmc_allowed;	/* nonzero if rdpmc is allowed */
  #ifdef CONFIG_X86_INTEL_MEMORY_PROTECTION_KEYS
  	/*
diff --color -rcNP Master/arch/x86/include/asm/mmu_context.h OG/arch/x86/include/asm/mmu_context.h
*** Master/arch/x86/include/asm/mmu_context.h	2021-04-20 14:17:26.000000000 -0400
--- OG/arch/x86/include/asm/mmu_context.h	2021-04-20 15:11:34.500000000 -0400
***************
*** 81,86 ****
--- 81,99 ----
  static inline void init_new_context_ldt(struct mm_struct *mm)
  {
  	mm->context.ldt = NULL;
+ 
+ #if defined(CONFIG_X86_32) && defined(CONFIG_MINISEC_PAGEEXEC)
+ 	/* A new mm starts with base 0 and an all-ones limit in the PAGEEXEC code-segment fields */
+ 	mm->context.user_cs_base = 0UL;
+ 	mm->context.user_cs_limit = ~0UL;
+ 
+ #ifdef CONFIG_SMP
+ 	/* ... and with no CPU recorded in the per-mm mask */
+ 	cpumask_clear(&mm->context.cpu_user_cs_mask);
+ #endif /* CONFIG_SMP */
+ 
+ #endif /* CONFIG_X86_32 && CONFIG_MINISEC_PAGEEXEC */
+ 
  	init_rwsem(&mm->context.ldt_usr_sem);
  }
  int ldt_dup_context(struct mm_struct *oldmm, struct mm_struct *mm);
diff --color -rcNP Master/arch/x86/kernel/cpu/common.c OG/arch/x86/kernel/cpu/common.c
*** Master/arch/x86/kernel/cpu/common.c	2021-04-20 14:17:26.000000000 -0400
--- OG/arch/x86/kernel/cpu/common.c	2021-04-20 15:11:34.500000000 -0400
***************
*** 1524,1529 ****
--- 1524,1539 ----
  	setup_smap(c);
  	setup_umip(c);
  
+ #if defined(CONFIG_X86_32) && defined(CONFIG_MINISEC_PAGEEXEC)
+ 	/* NX missing from the supported PTE mask: stop advertising PSE */
+ 	if (!(__supported_pte_mask & _PAGE_NX))
+ 		clear_cpu_cap(c, X86_FEATURE_PSE);
+ #endif
+ #if defined(CONFIG_X86_32) && defined(CONFIG_MINISEC_KERNEXEC)
+ 	/* KERNEXEC on 32-bit: stop advertising SEP */
+ 	clear_cpu_cap(c, X86_FEATURE_SEP);
+ #endif
+ 
  	/*
  	 * The vendor-specific functions might have changed features.
  	 * Now we do "generic changes."
diff --color -rcNP Master/arch/x86/kernel/process_64.c OG/arch/x86/kernel/process_64.c
*** Master/arch/x86/kernel/process_64.c	2021-04-20 14:17:26.000000000 -0400
--- OG/arch/x86/kernel/process_64.c	2021-04-20 15:11:34.500000000 -0400
***************
*** 63,68 ****
--- 63,83 ----
  
  #include "process.h"
  
+ #ifdef CONFIG_MINISEC_RANDKSTACK
+ /* Return the stack address the entry code should switch to: regs itself when
+  * randomize_va_space is 0, otherwise regs lowered by one of 16 offsets. */
+ unsigned long pax_randomize_kstack(struct pt_regs *regs)
+ {
+ 	unsigned long base = (unsigned long)regs;
+ 
+ 	if (!randomize_va_space)
+ 		return base;
+ 
+ 	/* low four TSC bits pick a 16-byte-granular offset of 0..240 bytes */
+ 	return base - ((rdtsc() & 0xFUL) << 4);
+ }
+ #endif
+ 
  /* Prints also some state that isn't saved in the pt_regs */
  void __show_regs(struct pt_regs *regs, enum show_regs_mode mode)
  {
diff --color -rcNP Master/arch/x86/mm/fault.c OG/arch/x86/mm/fault.c
*** Master/arch/x86/mm/fault.c	2021-04-20 14:17:26.000000000 -0400
--- OG/arch/x86/mm/fault.c	2021-04-20 15:11:34.500000000 -0400
***************
*** 142,147 ****
--- 142,155 ----
  	return prefetch;
  }
  
+ #if defined(CONFIG_MINISEC_PAGEEXEC)
+ static bool pax_is_fetch_fault(struct pt_regs *regs, unsigned long error_code, unsigned long address);
+ #endif
+ 
+ #ifdef CONFIG_MINISEC_EMUTRAMP
+ static int pax_handle_fetch_fault(struct pt_regs *regs);
+ #endif
+ 
  DEFINE_SPINLOCK(pgd_lock);
  LIST_HEAD(pgd_list);
  
***************
*** 420,426 ****
  
  #ifdef CONFIG_CPU_SUP_AMD
  static const char errata93_warning[] =
! KERN_ERR 
  "******* Your BIOS seems to not contain a fix for K8 errata #93\n"
  "******* Working around it, but it may cause SEGVs or burn power.\n"
  "******* Please consider a BIOS update.\n"
--- 428,434 ----
  
  #ifdef CONFIG_CPU_SUP_AMD
  static const char errata93_warning[] =
! KERN_ERR
  "******* Your BIOS seems to not contain a fix for K8 errata #93\n"
  "******* Working around it, but it may cause SEGVs or burn power.\n"
  "******* Please consider a BIOS update.\n"
***************
*** 907,912 ****
--- 915,935 ----
  		if (is_errata100(regs, address))
  			return;
  
+ #ifdef CONFIG_MINISEC_PAGEEXEC
+ 		/* PAGEEXEC: the fault was classified as an instruction fetch */
+ 		if (pax_is_fetch_fault(regs, error_code, address)) {
+ #ifdef CONFIG_MINISEC_EMUTRAMP
+ 			/* 2 means a known trampoline was emulated: let the task continue */
+ 			if (pax_handle_fetch_fault(regs) == 2)
+ 				return;
+ #endif
+ 
+ 			/* otherwise log the offending code and stack, then kill the group */
+ 			pax_report_fault(regs, (void *)regs->ip, (void *)regs->sp);
+ 			do_group_exit(SIGKILL);
+ 		}
+ #endif
+ 
  		/*
  		 * To avoid leaking information about the kernel page table
  		 * layout, pretend that user-mode accesses to kernel addresses
***************
*** 1096,1101 ****
--- 1119,1226 ----
  	return 1;
  }
  
+ #if defined(CONFIG_X86_32) && defined(CONFIG_MINISEC_PAGEEXEC)
+ static inline unsigned long get_limit(unsigned long segment)
+ {
+ 	unsigned long __limit;
+ 	/* lsll loads the limit of the segment selected by 'segment'; the caller receives limit + 1 */
+ 	asm("lsll %1,%0" : "=r" (__limit) : "r" (segment));
+ 	return __limit + 1;
+ }
+ 
+ /* Non-NX PAGEEXEC fault handler. Returns 0 to let the generic fault path continue (mmap_sem still held),
+  * or 1 when the fault was dealt with here and mmap_sem has been released. An instruction fetch from a
+  * protected page either gets a trampoline emulated (return 1) or is reported and the thread group killed. */
+ static int pax_handle_pageexec_fault(struct pt_regs *regs, struct mm_struct *mm, unsigned long address, unsigned long error_code)
+ {
+ 	pte_t *pte;
+ 	pmd_t *pmd;
+ 	spinlock_t *ptl;
+ 	unsigned char pte_mask;
+ 
+ 	if ((__supported_pte_mask & _PAGE_NX) || (error_code & (X86_PF_PROT|X86_PF_USER)) != (X86_PF_PROT|X86_PF_USER) || v8086_mode(regs) ||
+ 	    !(mm->pax_flags & MF_PAX_PAGEEXEC))
+ 		return 0;
+ 
+ 	/* PaX: it's our fault, let's handle it if we can; take a look at read faults before acquiring any locks */
+ 	if (unlikely(!(error_code & X86_PF_WRITE) && (regs->ip == address))) {
+ 		/* instruction fetch attempt from a protected page in user mode */
+ 		up_read(&mm->mmap_sem);
+ #ifdef CONFIG_MINISEC_EMUTRAMP
+ 		switch (pax_handle_fetch_fault(regs)) {
+ 		case 2:
+ 			return 1;
+ 		}
+ #endif
+ 
+ 		pax_report_fault(regs, (void *)regs->ip, (void *)regs->sp);
+ 		do_group_exit(SIGKILL);
+ 	}
+ 
+ 	pmd = pax_get_pmd(mm, address);
+ 	if (unlikely(!pmd))
+ 		return 0;
+ 
+ 	pte = pte_offset_map_lock(mm, pmd, address, &ptl);
+ 	if (unlikely(!(pte_val(*pte) & _PAGE_PRESENT) || pte_user(*pte))) {
+ 		pte_unmap_unlock(pte, ptl);
+ 		return 0;
+ 	}
+ 
+ 	if (unlikely((error_code & X86_PF_WRITE) && !pte_write(*pte))) {
+ 		/* write attempt to a protected page in user mode */
+ 		pte_unmap_unlock(pte, ptl);
+ 		return 0;
+ 	}
+ 
+ #ifdef CONFIG_SMP
+ 	if (likely(address > get_limit(regs->cs) && cpumask_test_cpu(smp_processor_id(), &mm->context.cpu_user_cs_mask)))
+ #else
+ 	if (likely(address > get_limit(regs->cs)))
+ #endif
+ 	{
+ 		set_pte(pte, pte_mkread(*pte));
+ 		__flush_tlb_one(address);
+ 		pte_unmap_unlock(pte, ptl);
+ 		up_read(&mm->mmap_sem);
+ 		return 1;
+ 	}
+ 
+ 	pte_mask = _PAGE_ACCESSED | _PAGE_USER | ((error_code & X86_PF_WRITE) << (_PAGE_BIT_DIRTY-1));
+ 
+ 	/*
+ 	 * PaX: fill DTLB with user rights and retry
+ 	 */
+ 	__asm__ __volatile__ (
+ 		"orb %2,(%1)\n"
+ #if defined(CONFIG_M586) || defined(CONFIG_M586TSC)
+ /*
+  * PaX: let this uncommented 'invlpg' remind us on the behaviour of Intel's
+  * (and AMD's) TLBs. namely, they do not cache PTEs that would raise *any*
+  * page fault when examined during a TLB load attempt. this is true not only
+  * for PTEs holding a non-present entry but also present entries that will
+  * raise a page fault (such as those set up by PaX, or the copy-on-write
+  * mechanism). in effect it means that we do *not* need to flush the TLBs
+  * for our target pages since their PTEs are simply not in the TLBs at all.
+  * the best thing in omitting it is that we gain around 15-20% speed in the
+  * fast path of the page fault handler and can get rid of tracing since we
+  * can no longer flush unintended entries.
+  */
+ 		"invlpg (%0)\n"
+ #endif
+ 		ASM_STAC "\n"
+ 		__copyuser_seg"testb $0,(%0)\n"
+ 		ASM_CLAC "\n"
+ 		"xorb %3,(%1)\n"
+ 		:
+ 		: "r" (address), "r" (pte), "q" (pte_mask), "i" (_PAGE_USER)
+ 		: "memory", "cc");
+ 	pte_unmap_unlock(pte, ptl);
+ 	up_read(&mm->mmap_sem);
+ 	return 1;
+ }
+ #endif
+ 
  /*
   * Handle a spurious fault caused by a stale TLB entry.
   *
***************
*** 1421,1426 ****
--- 1546,1556 ----
  		might_sleep();
  	}
  
+ #if defined(CONFIG_X86_32) && defined(CONFIG_MINISEC_PAGEEXEC)
+ 	if (pax_handle_pageexec_fault(regs, mm, address, hw_error_code))	/* the CPU-reported code, as passed to bad_area() just below */
+ 		return;	/* handled; the handler has already dropped mmap_sem */
+ #endif
+ 
  	vma = find_vma(mm, address);
  	if (unlikely(!vma)) {
  		bad_area(regs, hw_error_code, address);
***************
*** 1553,1555 ****
--- 1683,1941 ----
  	exception_exit(prev_state);
  }
  NOKPROBE_SYMBOL(do_page_fault);
+ 
+ #if defined(CONFIG_MINISEC_PAGEEXEC)
+ /* Classify a fault as an instruction fetch for PAGEEXEC purposes;
+  * always false when the current mm does not have MF_PAX_PAGEEXEC set. */
+ static bool pax_is_fetch_fault(struct pt_regs *regs, unsigned long error_code, unsigned long address)
+ {
+ 	unsigned long fault_ip = regs->ip;
+ 
+ 	/* vm86: fold CS:IP into (cs << 4) + ip so it is comparable with address */
+ 	if (v8086_mode(regs))
+ 		fault_ip = ((regs->cs & 0xffff) << 4) + (fault_ip & 0xffff);
+ 
+ 	if (!(current->mm->pax_flags & MF_PAX_PAGEEXEC))
+ 		return false;
+ 
+ 	/* with NX in the supported PTE mask, X86_PF_INSTR alone identifies a fetch */
+ 	if ((__supported_pte_mask & _PAGE_NX) && (error_code & X86_PF_INSTR))
+ 		return true;
+ 
+ 	/* otherwise: neither a protection nor a write fault, raised at the faulting ip */
+ 	return !(error_code & (X86_PF_PROT | X86_PF_WRITE)) && fault_ip == address;
+ }
+ #endif
+ 
+ #ifdef CONFIG_MINISEC_EMUTRAMP
+ static int pax_handle_fetch_fault_32(struct pt_regs *regs)
+ {
+ 	int err;
+ 	/* Each block reads one candidate byte pattern at regs->ip; on a match it updates regs as the trampoline would and returns 2. */
+ 	do { /* PaX: libffi trampoline emulation: 0xB8 imm32 (mov to eax), 0xE9 rel32 (jmp) */
+ 		unsigned char mov, jmp;
+ 		unsigned int addr1, addr2;
+ 
+ #ifdef CONFIG_X86_64
+ 		if ((regs->ip + 9) >> 32)
+ 			break;
+ #endif
+ 
+ 		err = get_user(mov, (unsigned char __user *)regs->ip);
+ 		err |= get_user(addr1, (unsigned int __user *)(regs->ip + 1));
+ 		err |= get_user(jmp, (unsigned char __user *)(regs->ip + 5));
+ 		err |= get_user(addr2, (unsigned int __user *)(regs->ip + 6));
+ 
+ 		if (err)
+ 			break;
+ 
+ 		if (mov == 0xB8 && jmp == 0xE9) {
+ 			regs->ax = addr1;
+ 			regs->ip = (unsigned int)(regs->ip + addr2 + 10);
+ 			return 2;
+ 		}
+ 	} while (0);
+ 
+ 	do { /* PaX: gcc trampoline emulation #1: 0xB9 imm32 (mov to ecx), 0xB8 imm32 (mov to eax), 0xFF 0xE0 (jmp *%eax) */
+ 		unsigned char mov1, mov2;
+ 		unsigned short jmp;
+ 		unsigned int addr1, addr2;
+ 
+ #ifdef CONFIG_X86_64
+ 		if ((regs->ip + 11) >> 32)
+ 			break;
+ #endif
+ 
+ 		err = get_user(mov1, (unsigned char __user *)regs->ip);
+ 		err |= get_user(addr1, (unsigned int __user *)(regs->ip + 1));
+ 		err |= get_user(mov2, (unsigned char __user *)(regs->ip + 5));
+ 		err |= get_user(addr2, (unsigned int __user *)(regs->ip + 6));
+ 		err |= get_user(jmp, (unsigned short __user *)(regs->ip + 10));
+ 
+ 		if (err)
+ 			break;
+ 
+ 		if (mov1 == 0xB9 && mov2 == 0xB8 && jmp == 0xE0FF) {
+ 			regs->cx = addr1;
+ 			regs->ax = addr2;
+ 			regs->ip = addr2;
+ 			return 2;
+ 		}
+ 	} while (0);
+ 
+ 	do { /* PaX: gcc trampoline emulation #2: 0xB9 imm32 (mov to ecx), 0xE9 rel32 (jmp) */
+ 		unsigned char mov, jmp;
+ 		unsigned int addr1, addr2;
+ 
+ #ifdef CONFIG_X86_64
+ 		if ((regs->ip + 9) >> 32)
+ 			break;
+ #endif
+ 
+ 		err = get_user(mov, (unsigned char __user *)regs->ip);
+ 		err |= get_user(addr1, (unsigned int __user *)(regs->ip + 1));
+ 		err |= get_user(jmp, (unsigned char __user *)(regs->ip + 5));
+ 		err |= get_user(addr2, (unsigned int __user *)(regs->ip + 6));
+ 
+ 		if (err)
+ 			break;
+ 
+ 		if (mov == 0xB9 && jmp == 0xE9) {
+ 			regs->cx = addr1;
+ 			regs->ip = (unsigned int)(regs->ip + addr2 + 10);
+ 			return 2;
+ 		}
+ 	} while (0);
+ 
+ 	return 1; /* PaX in action: no known pattern matched (or a read failed), so the caller kills the task */
+ }
+ 
+ #ifdef CONFIG_X86_64
+ static int pax_handle_fetch_fault_64(struct pt_regs *regs)
+ {
+ 	int err;
+ 	/* Each block reads one candidate byte pattern at regs->ip; on a match it updates regs as the trampoline would and returns 2. */
+ 	do { /* PaX: libffi trampoline emulation: 49 BB imm64 (mov to r11), 49 BA imm64 (mov to r10), F8/F9 (clc/stc), 49 FF E3 (jmp *%r11) */
+ 		unsigned short mov1, mov2, jmp1;
+ 		unsigned char stcclc, jmp2;
+ 		unsigned long addr1, addr2;
+ 
+ 		err = get_user(mov1, (unsigned short __user *)regs->ip);
+ 		err |= get_user(addr1, (unsigned long __user *)(regs->ip + 2));
+ 		err |= get_user(mov2, (unsigned short __user *)(regs->ip + 10));
+ 		err |= get_user(addr2, (unsigned long __user *)(regs->ip + 12));
+ 		err |= get_user(stcclc, (unsigned char __user *)(regs->ip + 20));
+ 		err |= get_user(jmp1, (unsigned short __user *)(regs->ip + 21));
+ 		err |= get_user(jmp2, (unsigned char __user *)(regs->ip + 23));
+ 
+ 		if (err)
+ 			break;
+ 
+ 		if (mov1 == 0xBB49 && mov2 == 0xBA49 && (stcclc == 0xF8 || stcclc == 0xF9) && jmp1 == 0xFF49 && jmp2 == 0xE3) {
+ 			regs->r11 = addr1;
+ 			regs->r10 = addr2;
+ 			if (stcclc == 0xF8)
+ 				regs->flags &= ~X86_EFLAGS_CF;
+ 			else
+ 				regs->flags |= X86_EFLAGS_CF;
+ 			regs->ip = addr1;
+ 			return 2;
+ 		}
+ 	} while (0);
+ 
+ 	do { /* PaX: gcc trampoline emulation #1: 41 BB imm32 (mov to r11d), 49 BA imm64 (mov to r10), 49 FF E3 (jmp *%r11) */
+ 		unsigned short mov1, mov2, jmp1;
+ 		unsigned char jmp2;
+ 		unsigned int addr1;
+ 		unsigned long addr2;
+ 
+ 		err = get_user(mov1, (unsigned short __user *)regs->ip);
+ 		err |= get_user(addr1, (unsigned int __user *)(regs->ip + 2));
+ 		err |= get_user(mov2, (unsigned short __user *)(regs->ip + 6));
+ 		err |= get_user(addr2, (unsigned long __user *)(regs->ip + 8));
+ 		err |= get_user(jmp1, (unsigned short __user *)(regs->ip + 16));
+ 		err |= get_user(jmp2, (unsigned char __user *)(regs->ip + 18));
+ 
+ 		if (err)
+ 			break;
+ 
+ 		if (mov1 == 0xBB41 && mov2 == 0xBA49 && jmp1 == 0xFF49 && jmp2 == 0xE3) {
+ 			regs->r11 = addr1;
+ 			regs->r10 = addr2;
+ 			regs->ip = addr1;
+ 			return 2;
+ 		}
+ 	} while (0);
+ 
+ 	do { /* PaX: gcc trampoline emulation #2: 49 BB imm64 (mov to r11), 49 BA imm64 (mov to r10), 49 FF E3 (jmp *%r11) */
+ 		unsigned short mov1, mov2, jmp1;
+ 		unsigned char jmp2;
+ 		unsigned long addr1, addr2;
+ 
+ 		err = get_user(mov1, (unsigned short __user *)regs->ip);
+ 		err |= get_user(addr1, (unsigned long __user *)(regs->ip + 2));
+ 		err |= get_user(mov2, (unsigned short __user *)(regs->ip + 10));
+ 		err |= get_user(addr2, (unsigned long __user *)(regs->ip + 12));
+ 		err |= get_user(jmp1, (unsigned short __user *)(regs->ip + 20));
+ 		err |= get_user(jmp2, (unsigned char __user *)(regs->ip + 22));
+ 
+ 		if (err)
+ 			break;
+ 
+ 		if (mov1 == 0xBB49 && mov2 == 0xBA49 && jmp1 == 0xFF49 && jmp2 == 0xE3) {
+ 			regs->r11 = addr1;
+ 			regs->r10 = addr2;
+ 			regs->ip = addr1;
+ 			return 2;
+ 		}
+ 	} while (0);
+ 
+ 	return 1; /* PaX in action: no known pattern matched (or a read failed), so the caller kills the task */
+ }
+ #endif
+ 
+ /*
+  * PaX: pick the trampoline emulator for a fetch fault at regs->ip.
+  *
+  * Returns 1 when the task should be killed: vm86 mode, an mm without
+  * MF_PAX_EMUTRAMP, or no trampoline pattern matched.
+  * Returns 2 when a trampoline was recognised and emulated, in which
+  * case regs already describes the state after the trampoline.
+  */
+ static int pax_handle_fetch_fault(struct pt_regs *regs)
+ {
+ 	/* no emulation for vm86 code or for an mm without MF_PAX_EMUTRAMP */
+ 	if (v8086_mode(regs) || !(current->mm->pax_flags & MF_PAX_EMUTRAMP))
+ 		return 1;
+ 
+ #ifndef CONFIG_X86_32
+ 	/* only __USER32_CS and LDT selectors take the 32-bit path here */
+ 	if (regs->cs != __USER32_CS && !(regs->cs & SEGMENT_LDT))
+ 		return pax_handle_fetch_fault_64(regs);
+ #endif
+ 
+ 	/* 32-bit kernel, or a 32-bit/LDT code segment on a 64-bit kernel */
+ 	return pax_handle_fetch_fault_32(regs);
+ }
+ #endif
+ 
+ #if defined(CONFIG_MINISEC_PAGEEXEC)
+ void pax_report_insns(struct pt_regs *regs, void *pc, void *sp)
+ {
+ 	long i;
+ 	/* Dump 20 bytes at user pc; a byte that cannot be read is shown as "??". */
+ 	printk(KERN_ERR "PAX: bytes at PC: ");
+ 	for (i = 0; i < 20; i++) {
+ 		unsigned char c;
+ 		if (get_user(c, (__force unsigned char __user *)pc+i))
+ 			printk(KERN_CONT "?? ");
+ 		else
+ 			printk(KERN_CONT "%02x ", c);
+ 	}
+ 	printk(KERN_CONT "\n");	/* KERN_CONT: finish the line instead of logging an empty record */
+ 	/* Dump long-sized words from one word below user sp up to sp + 80 bytes; on a 64-bit kernel, __USER32_CS or LDT code segments get each word split in two. */
+ 	printk(KERN_ERR "PAX: bytes at SP-%lu: ", (unsigned long)sizeof(long));
+ 	for (i = -1; i < 80 / (long)sizeof(long); i++) {
+ 		unsigned long c;
+ 		if (get_user(c, (__force unsigned long __user *)sp+i)) {
+ #ifdef CONFIG_X86_32
+ 			printk(KERN_CONT "???????? ");
+ #else
+ 			if ((regs->cs == __USER32_CS || (regs->cs & SEGMENT_LDT)))
+ 				printk(KERN_CONT "???????? ???????? ");
+ 			else
+ 				printk(KERN_CONT "???????????????? ");
+ #endif
+ 		} else {
+ #ifdef CONFIG_X86_64
+ 			if ((regs->cs == __USER32_CS || (regs->cs & SEGMENT_LDT))) {
+ 				printk(KERN_CONT "%08x ", (unsigned int)c);
+ 				printk(KERN_CONT "%08x ", (unsigned int)(c >> 32));
+ 			} else
+ #endif
+ 				printk(KERN_CONT "%0*lx ", 2 * (int)sizeof(long), c);
+ 		}
+ 	}
+ 	printk(KERN_CONT "\n");	/* KERN_CONT: finish the line instead of logging an empty record */
+ }
+ #endif
diff --color -rcNP Master/arch/x86/mm/fault.c.orig OG/arch/x86/mm/fault.c.orig
*** Master/arch/x86/mm/fault.c.orig	1969-12-31 19:00:00.000000000 -0500
--- OG/arch/x86/mm/fault.c.orig	2021-04-20 15:10:45.375000000 -0400
***************
*** 0 ****
--- 1,1941 ----
+ // SPDX-License-Identifier: GPL-2.0
+ /*
+  *  Copyright (C) 1995  Linus Torvalds
+  *  Copyright (C) 2001, 2002 Andi Kleen, SuSE Labs.
+  *  Copyright (C) 2008-2009, Red Hat Inc., Ingo Molnar
+  */
+ #include <linux/sched.h>		/* test_thread_flag(), ...	*/
+ #include <linux/sched/task_stack.h>	/* task_stack_*(), ...		*/
+ #include <linux/kdebug.h>		/* oops_begin/end, ...		*/
+ #include <linux/extable.h>		/* search_exception_tables	*/
+ #include <linux/memblock.h>		/* max_low_pfn			*/
+ #include <linux/kprobes.h>		/* NOKPROBE_SYMBOL, ...		*/
+ #include <linux/mmiotrace.h>		/* kmmio_handler, ...		*/
+ #include <linux/perf_event.h>		/* perf_sw_event		*/
+ #include <linux/hugetlb.h>		/* hstate_index_to_shift	*/
+ #include <linux/prefetch.h>		/* prefetchw			*/
+ #include <linux/context_tracking.h>	/* exception_enter(), ...	*/
+ #include <linux/uaccess.h>		/* faulthandler_disabled()	*/
+ #include <linux/efi.h>			/* efi_recover_from_page_fault()*/
+ #include <linux/mm_types.h>
+ 
+ #include <asm/cpufeature.h>		/* boot_cpu_has, ...		*/
+ #include <asm/traps.h>			/* dotraplinkage, ...		*/
+ #include <asm/pgalloc.h>		/* pgd_*(), ...			*/
+ #include <asm/fixmap.h>			/* VSYSCALL_ADDR		*/
+ #include <asm/vsyscall.h>		/* emulate_vsyscall		*/
+ #include <asm/vm86.h>			/* struct vm86			*/
+ #include <asm/mmu_context.h>		/* vma_pkey()			*/
+ #include <asm/efi.h>			/* efi_recover_from_page_fault()*/
+ #include <asm/desc.h>			/* store_idt(), ...		*/
+ #include <asm/cpu_entry_area.h>		/* exception stack		*/
+ 
+ #define CREATE_TRACE_POINTS
+ #include <asm/trace/exceptions.h>
+ 
+ /*
+  * Returns 0 if mmiotrace is disabled, or if the fault is not
+  * handled by mmiotrace:
+  */
+ static nokprobe_inline int
+ kmmio_fault(struct pt_regs *regs, unsigned long addr)
+ {
+ 	if (unlikely(is_kmmio_active()))
+ 		if (kmmio_handler(regs, addr) == 1)
+ 			return -1;
+ 	return 0;
+ }
+ 
+ /*
+  * Prefetch quirks:
+  *
+  * 32-bit mode:
+  *
+  *   Sometimes AMD Athlon/Opteron CPUs report invalid exceptions on prefetch.
+  *   Check that here and ignore it.
+  *
+  * 64-bit mode:
+  *
+  *   Sometimes the CPU reports invalid exceptions on prefetch.
+  *   Check that here and ignore it.
+  *
+  * Opcode checker based on code by Richard Brunner.
+  */
+ static inline int
+ check_prefetch_opcode(struct pt_regs *regs, unsigned char *instr,
+ 		      unsigned char opcode, int *prefetch)
+ {
+ 	unsigned char instr_hi = opcode & 0xf0;	/* high nibble: picks the case */
+ 	unsigned char instr_lo = opcode & 0x0f;	/* low nibble: refines the match */
+ 
+ 	switch (instr_hi) {
+ 	case 0x20:
+ 	case 0x30:
+ 		/*
+ 		 * Values 0x26,0x2E,0x36,0x3E are valid x86 prefixes.
+ 		 * In X86_64 long mode, the CPU will signal invalid
+ 		 * opcode if some of these prefixes are present so
+ 		 * X86_64 will never get here anyway
+ 		 */
+ 		return ((instr_lo & 7) == 0x6);
+ #ifdef CONFIG_X86_64
+ 	case 0x40:
+ 		/*
+ 		 * In AMD64 long mode 0x40..0x4F are valid REX prefixes
+ 		 * Need to figure out under what instruction mode the
+ 		 * instruction was issued. Could check the LDT for lm,
+ 		 * but for now it's good enough to assume that long
+ 		 * mode only uses well known segments or kernel.
+ 		 */
+ 		return (!user_mode(regs) || user_64bit_mode(regs));
+ #endif
+ 	case 0x60:
+ 		/* 0x64 thru 0x67 are valid prefixes in all modes. */
+ 		return (instr_lo & 0xC) == 0x4;
+ 	case 0xF0:
+ 		/* 0xF0, 0xF2, 0xF3 are valid prefixes in all modes. */
+ 		return !instr_lo || (instr_lo>>1) == 1;
+ 	case 0x00:
+ 		/* Prefetch instruction is 0x0F0D or 0x0F18 */
+ 		if (probe_kernel_address(instr, opcode))
+ 			return 0;	/* probe of the byte at instr failed */
+ 
+ 		*prefetch = (instr_lo == 0xF) &&
+ 			(opcode == 0x0D || opcode == 0x18);
+ 		return 0;	/* 0 ends is_prefetch()'s scan; verdict is in *prefetch */
+ 	default:
+ 		return 0;	/* high nibble is not treated as a prefix here */
+ 	}
+ }
+ 
+ static int
+ is_prefetch(struct pt_regs *regs, unsigned long error_code, unsigned long addr)
+ {
+ 	unsigned char *max_instr;
+ 	unsigned char *instr;
+ 	int prefetch = 0;	/* result; only check_prefetch_opcode() sets it */
+ 
+ 	/*
+ 	 * If it was an exec (instruction fetch) fault on NX page, then
+ 	 * do not ignore the fault:
+ 	 */
+ 	if (error_code & X86_PF_INSTR)
+ 		return 0;
+ 
+ 	instr = (void *)convert_ip_to_linear(current, regs);
+ 	max_instr = instr + 15;	/* examine at most 15 bytes from there */
+ 
+ 	if (user_mode(regs) && instr >= (unsigned char *)TASK_SIZE_MAX)
+ 		return 0;
+ 
+ 	while (instr < max_instr) {
+ 		unsigned char opcode;
+ 
+ 		if (probe_kernel_address(instr, opcode))
+ 			break;	/* probe of the current byte failed */
+ 
+ 		instr++;
+ 
+ 		if (!check_prefetch_opcode(regs, instr, opcode, &prefetch))
+ 			break;	/* not a prefix byte: the scan is finished */
+ 	}
+ 	return prefetch;
+ }
+ 
+ #if defined(CONFIG_MINISEC_PAGEEXEC)
+ static bool pax_is_fetch_fault(struct pt_regs *regs, unsigned long error_code, unsigned long address);
+ #endif /* CONFIG_MINISEC_PAGEEXEC: forward declaration only */
+ 
+ #ifdef CONFIG_MINISEC_EMUTRAMP
+ static int pax_handle_fetch_fault(struct pt_regs *regs);
+ #endif /* CONFIG_MINISEC_EMUTRAMP: forward declaration only */
+ 
+ DEFINE_SPINLOCK(pgd_lock);	/* taken around pgd_list walks, e.g. vmalloc_sync() */
+ LIST_HEAD(pgd_list);	/* pages linked through ->lru, see vmalloc_sync() */
+ 
+ #ifdef CONFIG_X86_32
+ static inline pmd_t *vmalloc_sync_one(pgd_t *pgd, unsigned long address)
+ {
+ 	unsigned index = pgd_index(address);
+ 	pgd_t *pgd_k;
+ 	p4d_t *p4d, *p4d_k;
+ 	pud_t *pud, *pud_k;
+ 	pmd_t *pmd, *pmd_k;
+ 
+ 	pgd += index;
+ 	pgd_k = init_mm.pgd + index;	/* same slot in the init_mm table */
+ 
+ 	if (!pgd_present(*pgd_k))
+ 		return NULL;
+ 
+ 	/*
+ 	 * set_pgd(pgd, *pgd_k); here would be useless on PAE
+ 	 * and redundant with the set_pmd() on non-PAE. As would
+ 	 * set_p4d/set_pud.
+ 	 */
+ 	p4d = p4d_offset(pgd, address);
+ 	p4d_k = p4d_offset(pgd_k, address);
+ 	if (!p4d_present(*p4d_k))
+ 		return NULL;
+ 
+ 	pud = pud_offset(p4d, address);
+ 	pud_k = pud_offset(p4d_k, address);
+ 	if (!pud_present(*pud_k))
+ 		return NULL;
+ 
+ 	pmd = pmd_offset(pud, address);
+ 	pmd_k = pmd_offset(pud_k, address);
+ 
+ 	if (pmd_present(*pmd) != pmd_present(*pmd_k))
+ 		set_pmd(pmd, *pmd_k);	/* copy the init_mm entry into the given table */
+ 
+ 	if (!pmd_present(*pmd_k))
+ 		return NULL;
+ 	else
+ 		BUG_ON(pmd_pfn(*pmd) != pmd_pfn(*pmd_k));	/* both must map the same pfn */
+ 
+ 	return pmd_k;	/* the init_mm PMD entry for address */
+ }
+ 
+ static void vmalloc_sync(void)
+ {
+ 	unsigned long address;
+ 
+ 	if (SHARED_KERNEL_PMD)	/* NOTE(review): presumably nothing to copy then - confirm */
+ 		return;
+ 
+ 	for (address = VMALLOC_START & PMD_MASK;
+ 	     address >= TASK_SIZE_MAX && address < VMALLOC_END;
+ 	     address += PMD_SIZE) {	/* one PMD_SIZE step per iteration */
+ 		struct page *page;
+ 
+ 		spin_lock(&pgd_lock);
+ 		list_for_each_entry(page, &pgd_list, lru) {
+ 			spinlock_t *pgt_lock;
+ 
+ 			/* the pgt_lock only for Xen */
+ 			pgt_lock = &pgd_page_get_mm(page)->page_table_lock;
+ 
+ 			spin_lock(pgt_lock);
+ 			vmalloc_sync_one(page_address(page), address);	/* result unused */
+ 			spin_unlock(pgt_lock);
+ 		}
+ 		spin_unlock(&pgd_lock);
+ 	}
+ }
+ 
+ void vmalloc_sync_mappings(void)
+ {
+ 	vmalloc_sync();	/* 32-bit: plain wrapper around the full vmalloc_sync() pass */
+ }
+ 
+ void vmalloc_sync_unmappings(void)
+ {
+ 	vmalloc_sync();	/* 32-bit: unmappings run the same vmalloc_sync() pass */
+ }
+ 
+ /*
+  * 32-bit:
+  *
+  *   Handle a fault on the vmalloc or module mapping area
+  */
+ static noinline int vmalloc_fault(unsigned long address)
+ {
+ 	unsigned long pgd_paddr;
+ 	pmd_t *pmd_k;
+ 	pte_t *pte_k;
+ 
+ 	/* Make sure we are in vmalloc area: */
+ 	if (!(address >= VMALLOC_START && address < VMALLOC_END))
+ 		return -1;
+ 
+ 	/*
+ 	 * Synchronize this task's top level page-table
+ 	 * with the 'reference' page table.
+ 	 *
+ 	 * Do _not_ use "current" here. We might be inside
+ 	 * an interrupt in the middle of a task switch..
+ 	 */
+ 	pgd_paddr = read_cr3_pa();
+ 	pmd_k = vmalloc_sync_one(__va(pgd_paddr), address);
+ 	if (!pmd_k)
+ 		return -1;	/* the reference table has no mapping either */
+ 
+ 	if (pmd_large(*pmd_k))
+ 		return 0;	/* large PMD mapping: no PTE level to check */
+ 
+ 	pte_k = pte_offset_kernel(pmd_k, address);
+ 	if (!pte_present(*pte_k))
+ 		return -1;
+ 
+ 	return 0;	/* 0: mapping is present after the sync */
+ }
+ NOKPROBE_SYMBOL(vmalloc_fault);
+ 
+ /*
+  * vm86 mode: flag a fault on any of the 32 DOS screen pages at VA 0xA0000.
+  */
+ static inline void
+ check_v8086_mode(struct pt_regs *regs, unsigned long address,
+ 		 struct task_struct *tsk)
+ {
+ #ifdef CONFIG_VM86
+ 	unsigned long bit;
+ 
+ 	if (!v8086_mode(regs) || !tsk->thread.vm86)
+ 		return;
+ 
+ 	bit = (address - 0xA0000) >> PAGE_SHIFT;	/* page index from 0xA0000 */
+ 	if (bit < 32)	/* lower addresses wrap to a huge value and are skipped */
+ 		tsk->thread.vm86->screen_bitmap |= 1UL << bit;	/* 1UL: bit 31 must not hit int's sign bit */
+ #endif
+ }
+ 
+ static bool low_pfn(unsigned long pfn)
+ {
+ 	return pfn < max_low_pfn;	/* true iff pfn lies below max_low_pfn */
+ }
+ 
+ static void dump_pagetable(unsigned long address)
+ {
+ 	pgd_t *base = __va(read_cr3_pa());	/* table referenced by CR3 right now */
+ 	pgd_t *pgd = &base[pgd_index(address)];
+ 	p4d_t *p4d;
+ 	pud_t *pud;
+ 	pmd_t *pmd;
+ 	pte_t *pte;
+ 
+ #ifdef CONFIG_X86_PAE
+ 	pr_info("*pdpt = %016Lx ", pgd_val(*pgd));
+ 	if (!low_pfn(pgd_val(*pgd) >> PAGE_SHIFT) || !pgd_present(*pgd))
+ 		goto out;
+ #define pr_pde pr_cont	/* PAE: the pde continues the *pdpt line */
+ #else
+ #define pr_pde pr_info	/* non-PAE: the pde starts the line */
+ #endif
+ 	p4d = p4d_offset(pgd, address);
+ 	pud = pud_offset(p4d, address);
+ 	pmd = pmd_offset(pud, address);
+ 	pr_pde("*pde = %0*Lx ", sizeof(*pmd) * 2, (u64)pmd_val(*pmd));
+ #undef pr_pde
+ 
+ 	/*
+ 	 * We must not directly access the pte in the highpte
+ 	 * case if the page table is located in highmem.
+ 	 * And let's rather not kmap-atomic the pte, just in case
+ 	 * it's allocated already:
+ 	 */
+ 	if (!low_pfn(pmd_pfn(*pmd)) || !pmd_present(*pmd) || pmd_large(*pmd))
+ 		goto out;
+ 
+ 	pte = pte_offset_kernel(pmd, address);
+ 	pr_cont("*pte = %0*Lx ", sizeof(*pte) * 2, (u64)pte_val(*pte));
+ out:
+ 	pr_cont("\n");	/* every path ends the line */
+ }
+ 
+ #else /* CONFIG_X86_64: */
+ 
+ void vmalloc_sync_mappings(void)
+ {
+ 	/*
+ 	 * 64-bit mappings might allocate new p4d/pud pages
+ 	 * that need to be propagated to all tasks' PGDs.
+ 	 */
+ 	sync_global_pgds(VMALLOC_START & PGDIR_MASK, VMALLOC_END);	/* start masked with PGDIR_MASK */
+ }
+ 
+ void vmalloc_sync_unmappings(void)
+ {
+ 	/*
+ 	 * Unmappings never allocate or free p4d/pud pages, so there is
+ 	 * nothing to propagate: this function is intentionally empty.
+ 	 */
+ }
+ 
+ /*
+  * 64-bit:
+  *
+  *   Handle a fault on the vmalloc area
+  */
+ static noinline int vmalloc_fault(unsigned long address)
+ {
+ 	pgd_t *pgd, *pgd_k;
+ 	p4d_t *p4d, *p4d_k;
+ 	pud_t *pud;
+ 	pmd_t *pmd;
+ 	pte_t *pte;
+ 
+ 	/* Make sure we are in vmalloc area: */
+ 	if (!(address >= VMALLOC_START && address < VMALLOC_END))
+ 		return -1;
+ 
+ 	/*
+ 	 * Copy kernel mappings over when needed. This can also
+ 	 * happen within a race in page table update. In the latter
+ 	 * case just flush:
+ 	 */
+ 	pgd = (pgd_t *)__va(read_cr3_pa()) + pgd_index(address);
+ 	pgd_k = pgd_offset_k(address);	/* entry in the kernel reference table */
+ 	if (pgd_none(*pgd_k))
+ 		return -1;
+ 
+ 	if (pgtable_l5_enabled()) {	/* 5-level paging: copy at the pgd level */
+ 		if (pgd_none(*pgd)) {
+ 			set_pgd(pgd, *pgd_k);
+ 			arch_flush_lazy_mmu_mode();
+ 		} else {
+ 			BUG_ON(pgd_page_vaddr(*pgd) != pgd_page_vaddr(*pgd_k));
+ 		}
+ 	}
+ 
+ 	/* With 4-level paging, copying happens on the p4d level. */
+ 	p4d = p4d_offset(pgd, address);
+ 	p4d_k = p4d_offset(pgd_k, address);
+ 	if (p4d_none(*p4d_k))
+ 		return -1;
+ 
+ 	if (p4d_none(*p4d) && !pgtable_l5_enabled()) {
+ 		set_p4d(p4d, *p4d_k);
+ 		arch_flush_lazy_mmu_mode();
+ 	} else {
+ 		BUG_ON(p4d_pfn(*p4d) != p4d_pfn(*p4d_k));	/* both must agree */
+ 	}
+ 
+ 	BUILD_BUG_ON(CONFIG_PGTABLE_LEVELS < 4);
+ 
+ 	pud = pud_offset(p4d, address);	/* from here on only presence is checked */
+ 	if (pud_none(*pud))
+ 		return -1;
+ 
+ 	if (pud_large(*pud))
+ 		return 0;	/* large PUD mapping: no lower level to check */
+ 
+ 	pmd = pmd_offset(pud, address);
+ 	if (pmd_none(*pmd))
+ 		return -1;
+ 
+ 	if (pmd_large(*pmd))
+ 		return 0;	/* large PMD mapping: no PTE level to check */
+ 
+ 	pte = pte_offset_kernel(pmd, address);
+ 	if (!pte_present(*pte))
+ 		return -1;
+ 
+ 	return 0;	/* 0: the address is mapped */
+ }
+ NOKPROBE_SYMBOL(vmalloc_fault);
+ 
+ #ifdef CONFIG_CPU_SUP_AMD
+ static const char errata93_warning[] =	/* emitted via printk_once() in is_errata93() */
+ KERN_ERR 
+ "******* Your BIOS seems to not contain a fix for K8 errata #93\n"
+ "******* Working around it, but it may cause SEGVs or burn power.\n"
+ "******* Please consider a BIOS update.\n"
+ "******* Disabling USB legacy in the BIOS may also help.\n";
+ #endif /* CONFIG_CPU_SUP_AMD */
+ 
+ /*
+  * No vm86 mode in 64-bit mode, so this is an empty stub:
+  */
+ static inline void
+ check_v8086_mode(struct pt_regs *regs, unsigned long address,
+ 		 struct task_struct *tsk)
+ {
+ }
+ 
+ static int bad_address(void *p)
+ {
+ 	unsigned long dummy;	/* receives the probed word; the value is not used */
+ 
+ 	return probe_kernel_address((unsigned long *)p, dummy);	/* nonzero is "bad" to dump_pagetable() */
+ }
+ 
+ static void dump_pagetable(unsigned long address)
+ {
+ 	pgd_t *base = __va(read_cr3_pa());	/* table referenced by CR3 right now */
+ 	pgd_t *pgd = base + pgd_index(address);
+ 	p4d_t *p4d;
+ 	pud_t *pud;
+ 	pmd_t *pmd;
+ 	pte_t *pte;
+ 
+ 	if (bad_address(pgd))	/* each level is probed before it is dereferenced */
+ 		goto bad;
+ 
+ 	pr_info("PGD %lx ", pgd_val(*pgd));
+ 
+ 	if (!pgd_present(*pgd))
+ 		goto out;
+ 
+ 	p4d = p4d_offset(pgd, address);
+ 	if (bad_address(p4d))
+ 		goto bad;
+ 
+ 	pr_cont("P4D %lx ", p4d_val(*p4d));
+ 	if (!p4d_present(*p4d) || p4d_large(*p4d))
+ 		goto out;	/* stop at a non-present or large entry */
+ 
+ 	pud = pud_offset(p4d, address);
+ 	if (bad_address(pud))
+ 		goto bad;
+ 
+ 	pr_cont("PUD %lx ", pud_val(*pud));
+ 	if (!pud_present(*pud) || pud_large(*pud))
+ 		goto out;
+ 
+ 	pmd = pmd_offset(pud, address);
+ 	if (bad_address(pmd))
+ 		goto bad;
+ 
+ 	pr_cont("PMD %lx ", pmd_val(*pmd));
+ 	if (!pmd_present(*pmd) || pmd_large(*pmd))
+ 		goto out;
+ 
+ 	pte = pte_offset_kernel(pmd, address);
+ 	if (bad_address(pte))
+ 		goto bad;
+ 
+ 	pr_cont("PTE %lx", pte_val(*pte));
+ out:
+ 	pr_cont("\n");	/* normal end of the walk: finish the line */
+ 	return;
+ bad:
+ 	pr_info("BAD\n");	/* an entry pointer failed the bad_address() probe */
+ }
+ 
+ #endif /* CONFIG_X86_64 */
+ 
+ /*
+  * Workaround for K8 erratum #93 & buggy BIOS.
+  *
+  * BIOS SMM functions are required to use a specific workaround
+  * to avoid corruption of the 64bit RIP register on C stepping K8.
+  *
+  * A lot of BIOS that didn't get tested properly miss this.
+  *
+  * The OS sees this as a page fault with the upper 32bits of RIP cleared.
+  * Try to work around it here.
+  *
+  * Note we only handle faults in kernel here.
+  * Does nothing on 32-bit.
+  */
+ static int is_errata93(struct pt_regs *regs, unsigned long address)
+ {
+ #if defined(CONFIG_X86_64) && defined(CONFIG_CPU_SUP_AMD)
+ 	if (boot_cpu_data.x86_vendor != X86_VENDOR_AMD
+ 	    || boot_cpu_data.x86 != 0xf)	/* only AMD family 0xf is considered */
+ 		return 0;
+ 
+ 	if (address != regs->ip)	/* the fault must be on the instruction pointer */
+ 		return 0;
+ 
+ 	if ((address >> 32) != 0)	/* and its upper 32 bits must be clear */
+ 		return 0;
+ 
+ 	address |= 0xffffffffUL << 32;	/* candidate RIP with the upper bits set again */
+ 	if ((address >= (u64)_stext && address <= (u64)_etext) ||
+ 	    (address >= MODULES_VADDR && address <= MODULES_END)) {
+ 		printk_once(errata93_warning);
+ 		regs->ip = address;	/* resume at the repaired address */
+ 		return 1;
+ 	}
+ #endif
+ 	return 0;
+ }
+ 
+ /*
+  * Work around K8 erratum #100: K8 in compat mode occasionally jumps
+  * to illegal addresses >4GB.
+  *
+  * We catch this in the page fault handler because these addresses
+  * are not reachable. Just detect this case and return.  Any code
+  * segment in LDT is compatibility mode.
+  */
+ static int is_errata100(struct pt_regs *regs, unsigned long address)
+ {
+ #ifdef CONFIG_X86_64
+ 	if ((regs->cs == __USER32_CS || (regs->cs & (1<<2))) && (address >> 32))
+ 		return 1;	/* 32-bit or LDT code segment with an address above 4GB */
+ #endif
+ 	return 0;
+ }
+ 
+ static int is_f00f_bug(struct pt_regs *regs, unsigned long address)
+ {
+ #ifdef CONFIG_X86_F00F_BUG
+ 	unsigned long nr;
+ 
+ 	/*
+ 	 * Pentium F0 0F C7 C8 bug workaround:
+ 	 */
+ 	if (boot_cpu_has_bug(X86_BUG_F00F)) {
+ 		nr = (address - idt_descr.address) >> 3;	/* 8-byte slot index from the IDT base */
+ 
+ 		if (nr == 6) {	/* slot 6 is handed to do_invalid_op() */
+ 			do_invalid_op(regs, 0);
+ 			return 1;
+ 		}
+ 	}
+ #endif
+ 	return 0;
+ }
+ 
+ static void show_ldttss(const struct desc_ptr *gdt, const char *name, u16 index)
+ {
+ 	u32 offset = (index >> 3) * sizeof(struct desc_struct);	/* byte offset of the entry in the GDT */
+ 	unsigned long addr;
+ 	struct ldttss_desc desc;
+ 
+ 	if (index == 0) {
+ 		pr_alert("%s: NULL\n", name);
+ 		return;
+ 	}
+ 
+ 	if (offset + sizeof(struct ldttss_desc) >= gdt->size) {
+ 		pr_alert("%s: 0x%hx -- out of bounds\n", name, index);
+ 		return;
+ 	}
+ 
+ 	if (probe_kernel_read(&desc, (void *)(gdt->address + offset),
+ 			      sizeof(struct ldttss_desc))) {
+ 		pr_alert("%s: 0x%hx -- GDT entry is not readable\n",
+ 			 name, index);
+ 		return;
+ 	}
+ 
+ 	addr = desc.base0 | (desc.base1 << 16) | ((unsigned long)desc.base2 << 24);	/* reassemble the split base */
+ #ifdef CONFIG_X86_64
+ 	addr |= ((u64)desc.base3 << 32);	/* upper 32 bits of the base */
+ #endif
+ 	pr_alert("%s: 0x%hx -- base=0x%lx limit=0x%x\n",
+ 		 name, index, addr, (desc.limit0 | (desc.limit1 << 16)));
+ }
+ 
+ static void
+ show_fault_oops(struct pt_regs *regs, unsigned long error_code, unsigned long address)
+ {
+ 	if (!oops_may_print())
+ 		return;
+ 
+ 	if (error_code & X86_PF_INSTR) {	/* instruction fetch: add NX / SMEP hints */
+ 		unsigned int level;
+ 		pgd_t *pgd;
+ 		pte_t *pte;
+ 
+ 		pgd = __va(read_cr3_pa());
+ 		pgd += pgd_index(address);
+ 
+ 		pte = lookup_address_in_pgd(pgd, address, &level);
+ 
+ 		if (pte && pte_present(*pte) && !pte_exec(*pte))
+ 			pr_crit("kernel tried to execute NX-protected page - exploit attempt? (uid: %d)\n",
+ 				from_kuid(&init_user_ns, current_uid()));
+ 		if (pte && pte_present(*pte) && pte_exec(*pte) &&
+ 				(pgd_flags(*pgd) & _PAGE_USER) &&
+ 				(__read_cr4() & X86_CR4_SMEP))
+ 			pr_crit("unable to execute userspace code (SMEP?) (uid: %d)\n",
+ 				from_kuid(&init_user_ns, current_uid()));
+ 	}
+ 
+ 	if (address < PAGE_SIZE && !user_mode(regs))	/* kernel fault in the first page */
+ 		pr_alert("BUG: kernel NULL pointer dereference, address: %px\n",
+ 			(void *)address);
+ 	else
+ 		pr_alert("BUG: unable to handle page fault for address: %px\n",
+ 			(void *)address);
+ 
+ 	pr_alert("#PF: %s %s in %s mode\n",
+ 		 (error_code & X86_PF_USER)  ? "user" : "supervisor",
+ 		 (error_code & X86_PF_INSTR) ? "instruction fetch" :
+ 		 (error_code & X86_PF_WRITE) ? "write access" :
+ 					       "read access",
+ 			     user_mode(regs) ? "user" : "kernel");
+ 	pr_alert("#PF: error_code(0x%04lx) - %s\n", error_code,
+ 		 !(error_code & X86_PF_PROT) ? "not-present page" :
+ 		 (error_code & X86_PF_RSVD)  ? "reserved bit violation" :
+ 		 (error_code & X86_PF_PK)    ? "protection keys violation" :
+ 					       "permissions violation");
+ 
+ 	if (!(error_code & X86_PF_USER) && user_mode(regs)) {	/* supervisor access from user mode */
+ 		struct desc_ptr idt, gdt;
+ 		u16 ldtr, tr;
+ 
+ 		/*
+ 		 * This can happen for quite a few reasons.  The more obvious
+ 		 * ones are faults accessing the GDT, or LDT.  Perhaps
+ 		 * surprisingly, if the CPU tries to deliver a benign or
+ 		 * contributory exception from user code and gets a page fault
+ 		 * during delivery, the page fault can be delivered as though
+ 		 * it originated directly from user code.  This could happen
+ 		 * due to wrong permissions on the IDT, GDT, LDT, TSS, or
+ 		 * kernel or IST stack.
+ 		 */
+ 		store_idt(&idt);
+ 
+ 		/* Usable even on Xen PV -- it's just slow. */
+ 		native_store_gdt(&gdt);
+ 
+ 		pr_alert("IDT: 0x%lx (limit=0x%hx) GDT: 0x%lx (limit=0x%hx)\n",
+ 			 idt.address, idt.size, gdt.address, gdt.size);
+ 
+ 		store_ldt(ldtr);
+ 		show_ldttss(&gdt, "LDTR", ldtr);
+ 
+ 		store_tr(tr);
+ 		show_ldttss(&gdt, "TR", tr);
+ 	}
+ 
+ 	dump_pagetable(address);	/* always finish with the page-table walk */
+ }
+ 
+ static noinline void
+ pgtable_bad(struct pt_regs *regs, unsigned long error_code,
+ 	    unsigned long address)
+ {
+ 	struct task_struct *tsk;
+ 	unsigned long flags;
+ 	int sig;
+ 
+ 	flags = oops_begin();	/* paired with oops_end() below */
+ 	tsk = current;
+ 	sig = SIGKILL;
+ 
+ 	printk(KERN_ALERT "%s: Corrupted page table at address %lx\n",
+ 	       tsk->comm, address);
+ 	dump_pagetable(address);
+ 
+ 	if (__die("Bad pagetable", regs, error_code))
+ 		sig = 0;	/* nonzero __die(): hand 0 instead of SIGKILL to oops_end() */
+ 
+ 	oops_end(flags, regs, sig);
+ }
+ 
+ static void set_signal_archinfo(unsigned long address,
+ 				unsigned long error_code)
+ {
+ 	struct task_struct *tsk = current;
+ 
+ 	/*
+ 	 * To avoid leaking information about the kernel page
+ 	 * table layout, pretend that user-mode accesses to
+ 	 * kernel addresses are always protection faults.
+ 	 *
+ 	 * NB: This means that failed vsyscalls with vsyscall=none
+ 	 * will have the PROT bit.  This doesn't leak any
+ 	 * information and does not appear to cause any problems.
+ 	 */
+ 	if (address >= TASK_SIZE_MAX)
+ 		error_code |= X86_PF_PROT;
+ 
+ 	tsk->thread.trap_nr = X86_TRAP_PF;
+ 	tsk->thread.error_code = error_code | X86_PF_USER;	/* always stored with the USER bit set */
+ 	tsk->thread.cr2 = address;	/* faulting address, kept in the thread struct */
+ }
+ 
+ static noinline void
+ no_context(struct pt_regs *regs, unsigned long error_code,
+ 	   unsigned long address, int signal, int si_code)
+ {
+ 	struct task_struct *tsk = current;
+ 	unsigned long flags;
+ 	int sig;
+ 
+ 	if (user_mode(regs)) {
+ 		/*
+ 		 * This is an implicit supervisor-mode access from user
+ 		 * mode.  Bypass all the kernel-mode recovery code and just
+ 		 * OOPS.
+ 		 */
+ 		goto oops;
+ 	}
+ 
+ 	/* Are we prepared to handle this kernel fault? */
+ 	if (fixup_exception(regs, X86_TRAP_PF, error_code, address)) {
+ 		/*
+ 		 * Any interrupt that takes a fault gets the fixup. This makes
+ 		 * the below recursive fault logic only apply to faults from
+ 		 * task context.
+ 		 */
+ 		if (in_interrupt())
+ 			return;
+ 
+ 		/*
+ 		 * Per the above we're !in_interrupt(), aka. task context.
+ 		 *
+ 		 * In this case we need to make sure we're not recursively
+ 		 * faulting through the emulate_vsyscall() logic.
+ 		 */
+ 		if (current->thread.sig_on_uaccess_err && signal) {	/* signal == 0: never raise one */
+ 			set_signal_archinfo(address, error_code);
+ 
+ 			/* XXX: hwpoison faults will set the wrong code. */
+ 			force_sig_fault(signal, si_code, (void __user *)address);
+ 		}
+ 
+ 		/*
+ 		 * Barring that, we can do the fixup and be happy.
+ 		 */
+ 		return;
+ 	}
+ 
+ #ifdef CONFIG_VMAP_STACK
+ 	/*
+ 	 * Stack overflow?  During boot, we can fault near the initial
+ 	 * stack in the direct map, but that's not an overflow -- check
+ 	 * that we're in vmalloc space to avoid this.
+ 	 */
+ 	if (is_vmalloc_addr((void *)address) &&
+ 	    (((unsigned long)tsk->stack - 1 - address < PAGE_SIZE) ||
+ 	     address - ((unsigned long)tsk->stack + THREAD_SIZE) < PAGE_SIZE)) {
+ 		unsigned long stack = __this_cpu_ist_top_va(DF) - sizeof(void *);	/* new stack pointer for the call below */
+ 		/*
+ 		 * We're likely to be running with very little stack space
+ 		 * left.  It's plausible that we'd hit this condition but
+ 		 * double-fault even before we get this far, in which case
+ 		 * we're fine: the double-fault handler will deal with it.
+ 		 *
+ 		 * We don't want to make it all the way into the oops code
+ 		 * and then double-fault, though, because we're likely to
+ 		 * break the console driver and lose most of the stack dump.
+ 		 */
+ 		asm volatile ("movq %[stack], %%rsp\n\t"
+ 			      "call handle_stack_overflow\n\t"
+ 			      "1: jmp 1b"
+ 			      : ASM_CALL_CONSTRAINT
+ 			      : "D" ("kernel stack overflow (page fault)"),
+ 				"S" (regs), "d" (address),
+ 				[stack] "rm" (stack));
+ 		unreachable();	/* the asm ends in an endless jmp loop */
+ 	}
+ #endif
+ 
+ 	/*
+ 	 * 32-bit:
+ 	 *
+ 	 *   Valid to do another page fault here, because if this fault
+ 	 *   had been triggered by is_prefetch fixup_exception would have
+ 	 *   handled it.
+ 	 *
+ 	 * 64-bit:
+ 	 *
+ 	 *   Hall of shame of CPU/BIOS bugs.
+ 	 */
+ 	if (is_prefetch(regs, error_code, address))
+ 		return;
+ 
+ 	if (is_errata93(regs, address))
+ 		return;
+ 
+ 	/*
+ 	 * Buggy firmware could access regions which might page fault, try to
+ 	 * recover from such faults.
+ 	 */
+ 	if (IS_ENABLED(CONFIG_EFI))
+ 		efi_recover_from_page_fault(address);
+ 
+ oops:
+ 	/*
+ 	 * Oops. The kernel tried to access some bad page. We'll have to
+ 	 * terminate things with extreme prejudice:
+ 	 */
+ 	flags = oops_begin();
+ 
+ 	show_fault_oops(regs, error_code, address);
+ 
+ 	if (task_stack_end_corrupted(tsk))
+ 		printk(KERN_EMERG "Thread overran stack, or stack corrupted\n");
+ 
+ 	sig = SIGKILL;
+ 	if (__die("Oops", regs, error_code))
+ 		sig = 0;	/* nonzero __die(): hand 0 instead of SIGKILL to oops_end() */
+ 
+ 	/* Executive summary in case the body of the oops scrolled away */
+ 	printk(KERN_DEFAULT "CR2: %016lx\n", address);
+ 
+ 	oops_end(flags, regs, sig);
+ }
+ 
+ /*
+  * Print out info about fatal segfaults, if the show_unhandled_signals
+  * sysctl is set:
+  */
+ static inline void
+ show_signal_msg(struct pt_regs *regs, unsigned long error_code,
+ 		unsigned long address, struct task_struct *tsk)
+ {
+ 	const char *loglvl = task_pid_nr(tsk) > 1 ? KERN_INFO : KERN_EMERG;	/* pid <= 1 logs at KERN_EMERG */
+ 
+ 	if (!unhandled_signal(tsk, SIGSEGV))
+ 		return;
+ 
+ 	if (!printk_ratelimit())
+ 		return;
+ 
+ 	printk("%s%s[%d]: segfault at %lx ip %px sp %px error %lx",
+ 		loglvl, tsk->comm, task_pid_nr(tsk), address,
+ 		(void *)regs->ip, (void *)regs->sp, error_code);
+ 
+ 	print_vma_addr(KERN_CONT " in ", regs->ip);	/* continues the same log line */
+ 
+ 	printk(KERN_CONT "\n");
+ 
+ 	show_opcodes(regs, loglvl);
+ }
+ 
+ /*
+  * The (legacy) vsyscall page is the lone page in the kernel portion
+  * of the address space that has user-accessible permissions.
+  */
+ static bool is_vsyscall_vaddr(unsigned long vaddr)
+ {
+ 	return unlikely((vaddr & PAGE_MASK) == VSYSCALL_ADDR);	/* compare the page base only */
+ }
+ 
+ static void
+ __bad_area_nosemaphore(struct pt_regs *regs, unsigned long error_code,
+ 		       unsigned long address, u32 pkey, int si_code)
+ {
+ 	struct task_struct *tsk = current;
+ 
+ 	/* User mode accesses just cause a SIGSEGV */
+ 	if (user_mode(regs) && (error_code & X86_PF_USER)) {
+ 		/*
+ 		 * It's possible to have interrupts off here:
+ 		 */
+ 		local_irq_enable();
+ 
+ 		/*
+ 		 * Valid to do another page fault here because this one came
+ 		 * from user space:
+ 		 */
+ 		if (is_prefetch(regs, error_code, address))
+ 			return;
+ 
+ 		if (is_errata100(regs, address))
+ 			return;
+ 
+ #ifdef CONFIG_MINISEC_PAGEEXEC
+ 		if (pax_is_fetch_fault(regs, error_code, address)) {
+ 
+ #ifdef CONFIG_MINISEC_EMUTRAMP
+ 			switch (pax_handle_fetch_fault(regs)) {
+ 			case 2:	/* NOTE(review): 2 presumably means "emulated" - confirm */
+ 				return;
+ 			}
+ #endif
+ 
+ 			pax_report_fault(regs, (void *)regs->ip, (void *)regs->sp);
+ 			do_group_exit(SIGKILL);	/* otherwise the fetch fault kills the thread group */
+ 		}
+ #endif
+ 
+ 		/*
+ 		 * To avoid leaking information about the kernel page table
+ 		 * layout, pretend that user-mode accesses to kernel addresses
+ 		 * are always protection faults.
+ 		 */
+ 		if (address >= TASK_SIZE_MAX)
+ 			error_code |= X86_PF_PROT;
+ 
+ 		if (likely(show_unhandled_signals))
+ 			show_signal_msg(regs, error_code, address, tsk);
+ 
+ 		set_signal_archinfo(address, error_code);
+ 
+ 		if (si_code == SEGV_PKUERR)
+ 			force_sig_pkuerr((void __user *)address, pkey);	/* followed by force_sig_fault() below */
+ 
+ 		force_sig_fault(SIGSEGV, si_code, (void __user *)address);
+ 
+ 		return;
+ 	}
+ 
+ 	if (is_f00f_bug(regs, address))
+ 		return;
+ 
+ 	no_context(regs, error_code, address, SIGSEGV, si_code);	/* not a plain user-mode fault */
+ }
+ 
+ static noinline void
+ bad_area_nosemaphore(struct pt_regs *regs, unsigned long error_code,
+ 		     unsigned long address)
+ {
+ 	__bad_area_nosemaphore(regs, error_code, address, 0, SEGV_MAPERR);	/* pkey 0, SEGV_MAPERR */
+ }
+ 
+ static void
+ __bad_area(struct pt_regs *regs, unsigned long error_code,
+ 	   unsigned long address, u32 pkey, int si_code)
+ {
+ 	struct mm_struct *mm = current->mm;
+ 	/*
+ 	 * Something tried to access memory that isn't in our memory map..
+ 	 * Fix it, but check if it's kernel or user first..
+ 	 */
+ 	up_read(&mm->mmap_sem);	/* release the read lock on mmap_sem first */
+ 
+ 	__bad_area_nosemaphore(regs, error_code, address, pkey, si_code);
+ }
+ 
+ static noinline void
+ bad_area(struct pt_regs *regs, unsigned long error_code, unsigned long address)
+ {
+ 	__bad_area(regs, error_code, address, 0, SEGV_MAPERR);	/* pkey 0, SEGV_MAPERR; drops mmap_sem */
+ }
+ 
+ static inline bool bad_area_access_from_pkeys(unsigned long error_code,
+ 		struct vm_area_struct *vma)
+ {
+ 	/* Faults handled here are always against the current mm. */
+ 	bool foreign = false;
+ 
+ 	/* No OSPKE feature: never reported as a protection-key error. */
+ 	if (!boot_cpu_has(X86_FEATURE_OSPKE))
+ 		return false;
+ 	if (error_code & X86_PF_PK)
+ 		return true;
+ 
+ 	/* Otherwise the verdict is whether the VMA's pkey permits the access. */
+ 	return !arch_vma_access_permitted(vma, (error_code & X86_PF_WRITE),
+ 					  (error_code & X86_PF_INSTR), foreign);
+ }
+ 
+ static noinline void
+ bad_area_access_error(struct pt_regs *regs, unsigned long error_code,
+ 		      unsigned long address, struct vm_area_struct *vma)
+ {
+ 	/*
+ 	 * This OSPKE check is not strictly necessary at runtime.
+ 	 * But, doing it this way allows compiler optimizations
+ 	 * if pkeys are compiled out.
+ 	 */
+ 	if (bad_area_access_from_pkeys(error_code, vma)) {
+ 		/*
+ 		 * A protection key fault means that the PKRU value did not allow
+ 		 * access to some PTE.  Userspace can figure out what PKRU was
+ 		 * from the XSAVE state.  This function captures the pkey from
+ 		 * the vma and passes it to userspace so userspace can discover
+ 		 * which protection key was set on the PTE.
+ 		 *
+ 		 * If we get here, we know that the hardware signaled a X86_PF_PK
+ 		 * fault and that there was a VMA once we got in the fault
+ 		 * handler.  It does *not* guarantee that the VMA we find here
+ 		 * was the one that we faulted on.
+ 		 *
+ 		 * 1. T1   : mprotect_key(foo, PAGE_SIZE, pkey=4);
+ 		 * 2. T1   : set PKRU to deny access to pkey=4, touches page
+ 		 * 3. T1   : faults...
+ 		 * 4.    T2: mprotect_key(foo, PAGE_SIZE, pkey=5);
+ 		 * 5. T1   : enters fault handler, takes mmap_sem, etc...
+ 		 * 6. T1   : reaches here, sees vma_pkey(vma)=5, when we really
+ 		 *	     faulted on a pte with its pkey=4.
+ 		 */
+ 		u32 pkey = vma_pkey(vma);	/* may differ from the faulting pkey, see above */
+ 
+ 		__bad_area(regs, error_code, address, pkey, SEGV_PKUERR);
+ 	} else {
+ 		__bad_area(regs, error_code, address, 0, SEGV_ACCERR);	/* ordinary access error, pkey 0 */
+ 	}
+ }
+ 
+ static void
+ do_sigbus(struct pt_regs *regs, unsigned long error_code, unsigned long address,
+ 	  vm_fault_t fault)
+ {
+ 	/* Kernel mode? Handle exceptions or die: */
+ 	if (!(error_code & X86_PF_USER)) {
+ 		no_context(regs, error_code, address, SIGBUS, BUS_ADRERR);
+ 		return;
+ 	}
+ 
+ 	/* User-space => ok to do another page fault: */
+ 	if (is_prefetch(regs, error_code, address))
+ 		return;
+ 
+ 	set_signal_archinfo(address, error_code);
+ 
+ #ifdef CONFIG_MEMORY_FAILURE
+ 	if (fault & (VM_FAULT_HWPOISON|VM_FAULT_HWPOISON_LARGE)) {
+ 		struct task_struct *tsk = current;
+ 		unsigned lsb = 0;	/* shift value passed to force_sig_mceerr() */
+ 
+ 		pr_err(
+ 	"MCE: Killing %s:%d due to hardware memory corruption fault at %lx\n",
+ 			tsk->comm, tsk->pid, address);
+ 		if (fault & VM_FAULT_HWPOISON_LARGE)
+ 			lsb = hstate_index_to_shift(VM_FAULT_GET_HINDEX(fault));
+ 		if (fault & VM_FAULT_HWPOISON)
+ 			lsb = PAGE_SHIFT;	/* takes precedence if both flags are set */
+ 		force_sig_mceerr(BUS_MCEERR_AR, (void __user *)address, lsb);
+ 		return;
+ 	}
+ #endif
+ 	force_sig_fault(SIGBUS, BUS_ADRERR, (void __user *)address);
+ }
+ 
+ static noinline void
+ mm_fault_error(struct pt_regs *regs, unsigned long error_code,
+ 	       unsigned long address, vm_fault_t fault)
+ {
+ 	if (fatal_signal_pending(current) && !(error_code & X86_PF_USER)) {
+ 		no_context(regs, error_code, address, 0, 0);	/* signal 0: no_context() raises none */
+ 		return;
+ 	}
+ 
+ 	if (fault & VM_FAULT_OOM) {
+ 		/* Kernel mode? Handle exceptions or die: */
+ 		if (!(error_code & X86_PF_USER)) {
+ 			no_context(regs, error_code, address,
+ 				   SIGSEGV, SEGV_MAPERR);
+ 			return;
+ 		}
+ 
+ 		/*
+ 		 * We ran out of memory, call the OOM killer, and return to
+ 		 * userspace (which will retry the fault, or kill us if we got
+ 		 * oom-killed):
+ 		 */
+ 		pagefault_out_of_memory();
+ 	} else {
+ 		if (fault & (VM_FAULT_SIGBUS|VM_FAULT_HWPOISON|
+ 			     VM_FAULT_HWPOISON_LARGE))
+ 			do_sigbus(regs, error_code, address, fault);
+ 		else if (fault & VM_FAULT_SIGSEGV)
+ 			bad_area_nosemaphore(regs, error_code, address);
+ 		else
+ 			BUG();	/* any other fault value is treated as a bug */
+ 	}
+ }
+ 
+ static int spurious_kernel_fault_check(unsigned long error_code, pte_t *pte)
+ {
+ 	/* A write fault is genuine when the entry is not writable. */
+ 	bool write_denied = (error_code & X86_PF_WRITE) && !pte_write(*pte);
+ 	/* A fetch fault is genuine when the entry is not executable. */
+ 	bool exec_denied = !write_denied &&
+ 			   (error_code & X86_PF_INSTR) && !pte_exec(*pte);
+ 
+ 	return !(write_denied || exec_denied);	/* 1: may be spurious, 0: genuine */
+ }
+ 
+ #if defined(CONFIG_X86_32) && defined(CONFIG_MINISEC_PAGEEXEC)
+ static inline unsigned long get_limit(unsigned long segment)
+ {
+ 	unsigned long __limit;
+ 
+ 	asm("lsll %1,%0" : "=r" (__limit) : "r" (segment));	/* lsll on the given selector */
+ 	return __limit + 1;	/* the lsll result plus one */
+ }
+ 
+ static int pax_handle_pageexec_fault(struct pt_regs *regs, struct mm_struct *mm, unsigned long address, unsigned long error_code)
+ {
+ 	pte_t *pte;
+ 	pmd_t *pmd;
+ 	spinlock_t *ptl;
+ 	unsigned char pte_mask;
+ 
+ 	if ((__supported_pte_mask & _PAGE_NX) || (error_code & (X86_PF_PROT|X86_PF_USER)) != (X86_PF_PROT|X86_PF_USER) || v8086_mode(regs) ||
+ 	    !(mm->pax_flags & MF_PAX_PAGEEXEC))
+ 		return 0;	/* 0: fault not handled here */
+ 
+ 	/* PaX: it's our fault, let's handle it if we can */
+ 
+ 	/* PaX: take a look at read faults before acquiring any locks */
+ 	if (unlikely(!(error_code & X86_PF_WRITE) && (regs->ip == address))) {
+ 		/* instruction fetch attempt from a protected page in user mode */
+ 		up_read(&mm->mmap_sem);
+ 
+ #ifdef CONFIG_MINISEC_EMUTRAMP
+ 		switch (pax_handle_fetch_fault(regs)) {
+ 		case 2:	/* NOTE(review): 2 presumably means "emulated" - confirm */
+ 			return 1;
+ 		}
+ #endif
+ 
+ 		pax_report_fault(regs, (void *)regs->ip, (void *)regs->sp);
+ 		do_group_exit(SIGKILL);
+ 	}
+ 
+ 	pmd = pax_get_pmd(mm, address);
+ 	if (unlikely(!pmd))
+ 		return 0;
+ 
+ 	pte = pte_offset_map_lock(mm, pmd, address, &ptl);
+ 	if (unlikely(!(pte_val(*pte) & _PAGE_PRESENT) || pte_user(*pte))) {
+ 		pte_unmap_unlock(pte, ptl);
+ 		return 0;
+ 	}
+ 
+ 	if (unlikely((error_code & X86_PF_WRITE) && !pte_write(*pte))) {
+ 		/* write attempt to a protected page in user mode */
+ 		pte_unmap_unlock(pte, ptl);
+ 		return 0;
+ 	}
+ 
+ #ifdef CONFIG_SMP
+ 	if (likely(address > get_limit(regs->cs) && cpumask_test_cpu(smp_processor_id(), &mm->context.cpu_user_cs_mask)))
+ #else
+ 	if (likely(address > get_limit(regs->cs)))
+ #endif
+ 	{
+ 		set_pte(pte, pte_mkread(*pte));
+ 		__flush_tlb_one(address);
+ 		pte_unmap_unlock(pte, ptl);
+ 		up_read(&mm->mmap_sem);
+ 		return 1;	/* 1: handled; mmap_sem has been released */
+ 	}
+ 
+ 	pte_mask = _PAGE_ACCESSED | _PAGE_USER | ((error_code & X86_PF_WRITE) << (_PAGE_BIT_DIRTY-1));
+ 
+ 	/*
+ 	 * PaX: fill DTLB with user rights and retry
+ 	 */
+ 	__asm__ __volatile__ (
+ 		"orb %2,(%1)\n"
+ #if defined(CONFIG_M586) || defined(CONFIG_M586TSC)
+ /*
+  * PaX: let this uncommented 'invlpg' remind us of the behaviour of Intel's
+  * (and AMD's) TLBs. namely, they do not cache PTEs that would raise *any*
+  * page fault when examined during a TLB load attempt. this is true not only
+  * for PTEs holding a non-present entry but also present entries that will
+  * raise a page fault (such as those set up by PaX, or the copy-on-write
+  * mechanism). in effect it means that we do *not* need to flush the TLBs
+  * for our target pages since their PTEs are simply not in the TLBs at all.
+  * the best thing in omitting it is that we gain around 15-20% speed in the
+  * fast path of the page fault handler and can get rid of tracing since we
+  * can no longer flush unintended entries.
+  */
+ 		"invlpg (%0)\n"
+ #endif
+ 		ASM_STAC "\n"
+ 		__copyuser_seg"testb $0,(%0)\n"
+ 		ASM_CLAC "\n"
+ 		"xorb %3,(%1)\n"
+ 		:
+ 		: "r" (address), "r" (pte), "q" (pte_mask), "i" (_PAGE_USER)
+ 		: "memory", "cc");
+ 	pte_unmap_unlock(pte, ptl);
+ 	up_read(&mm->mmap_sem);
+ 	return 1;	/* 1: handled; mmap_sem has been released */
+ }
+ #endif
+ 
+ /*
+  * Handle a spurious fault caused by a stale TLB entry.
+  *
+  * This allows us to lazily refresh the TLB when increasing the
+  * permissions of a kernel page (RO -> RW or NX -> X).  Doing it
+  * eagerly is very expensive since that implies doing a full
+  * cross-processor TLB flush, even if no stale TLB entries exist
+  * on other processors.
+  *
+  * Spurious faults may only occur if the TLB contains an entry with
+  * fewer permissions than the page table entry.  Non-present (P = 0)
+  * and reserved bit (R = 1) faults are never spurious.
+  *
+  * There are no security implications to leaving a stale TLB when
+  * increasing the permissions on a page.
+  *
+  * Returns non-zero if a spurious fault was handled, zero otherwise.
+  *
+  * See Intel Developer's Manual Vol 3 Section 4.10.4.3, bullet 3
+  * (Optional Invalidation).
+  */
+ static noinline int
+ spurious_kernel_fault(unsigned long error_code, unsigned long address)
+ {
+ 	pgd_t *pgd;
+ 	p4d_t *p4d;
+ 	pud_t *pud;
+ 	pmd_t *pmd;
+ 	pte_t *pte;
+ 	int ret;
+ 
+ 	/*
+ 	 * Only writes to RO or instruction fetches from NX may cause
+ 	 * spurious faults.
+ 	 *
+ 	 * The comparison below is exact, so any additional error bit
+ 	 * (such as X86_PF_USER) means "not spurious".  The TLB is only
+ 	 * lazily flushed after a kernel mapping protection change, so
+ 	 * user accesses are not expected to cause spurious faults.
+ 	 */
+ 	if (error_code != (X86_PF_WRITE | X86_PF_PROT) &&
+ 	    error_code != (X86_PF_INSTR | X86_PF_PROT))
+ 		return 0;
+ 
+ 	pgd = init_mm.pgd + pgd_index(address);	/* walk starts at init_mm's top level */
+ 	if (!pgd_present(*pgd))
+ 		return 0;
+ 
+ 	p4d = p4d_offset(pgd, address);
+ 	if (!p4d_present(*p4d))
+ 		return 0;
+ 
+ 	if (p4d_large(*p4d))	/* large entry: check this level's bits as if it were a PTE */
+ 		return spurious_kernel_fault_check(error_code, (pte_t *) p4d);
+ 
+ 	pud = pud_offset(p4d, address);
+ 	if (!pud_present(*pud))
+ 		return 0;
+ 
+ 	if (pud_large(*pud))
+ 		return spurious_kernel_fault_check(error_code, (pte_t *) pud);
+ 
+ 	pmd = pmd_offset(pud, address);
+ 	if (!pmd_present(*pmd))
+ 		return 0;
+ 
+ 	if (pmd_large(*pmd))
+ 		return spurious_kernel_fault_check(error_code, (pte_t *) pmd);
+ 
+ 	pte = pte_offset_kernel(pmd, address);
+ 	if (!pte_present(*pte))
+ 		return 0;
+ 
+ 	ret = spurious_kernel_fault_check(error_code, pte);
+ 	if (!ret)
+ 		return 0;
+ 
+ 	/*
+ 	 * The PTE allows the access, so the PMD above it must allow it
+ 	 * too.  If it does not, there is a bug in the page tables:
+ 	 */
+ 	ret = spurious_kernel_fault_check(error_code, (pte_t *) pmd);
+ 	WARN_ONCE(!ret, "PMD has incorrect permission bits\n");
+ 
+ 	return ret;
+ }
+ NOKPROBE_SYMBOL(spurious_kernel_fault);
+ 
+ int show_unhandled_signals = 1;
+ 
+ static inline int
+ access_error(unsigned long error_code, struct vm_area_struct *vma)
+ {
+ 	/* Only ever called for the current mm, hence never foreign. */
+ 	bool foreign = false;
+ 
+ 	/*
+ 	 * A protection-key violation is final: unlike e.g. COW there
+ 	 * is no follow-up action that could resolve the fault, so it
+ 	 * is reported as an error unconditionally.
+ 	 */
+ 	if (error_code & X86_PF_PK)
+ 		return 1;
+ 
+ 	/*
+ 	 * Ask the VMA up front, so that we do not fill in a page
+ 	 * only to run into X86_PF_PK immediately afterwards.
+ 	 */
+ 	if (!arch_vma_access_permitted(vma, (error_code & X86_PF_WRITE),
+ 				       (error_code & X86_PF_INSTR), foreign))
+ 		return 1;
+ 
+ 	/*
+ 	 * Writes are checked the same way whether the page is
+ 	 * present or not: they are an error exactly when the VMA
+ 	 * is not writable.
+ 	 */
+ 	if (error_code & X86_PF_WRITE)
+ 		return !(vma->vm_flags & VM_WRITE);
+ 
+ 	/* Not a write, page present: a real protection violation. */
+ 	if (unlikely(error_code & X86_PF_PROT))
+ 		return 1;
+ 
+ 	/*
+ 	 * Not a write, page not present: an error only if the VMA
+ 	 * grants no access at all (neither read, exec nor write).
+ 	 */
+ 	return !(vma->vm_flags & (VM_READ | VM_EXEC | VM_WRITE));
+ }
+ 
+ static int fault_in_kernel_space(unsigned long address)
+ {
+ 	/*
+ 	 * Everything at or above TASK_SIZE_MAX is kernel space, with
+ 	 * one exception: the 64-bit vsyscall page sits up there but is
+ 	 * not considered part of the kernel address space.
+ 	 */
+ 	if (!IS_ENABLED(CONFIG_X86_64) || !is_vsyscall_vaddr(address))
+ 		return address >= TASK_SIZE_MAX;
+ 
+ 	return 0;
+ }
+ 
+ /*
+  * Called for all faults where 'address' is part of the kernel address
+  * space.  Might get called for faults that originate from *code* that
+  * ran in userspace or the kernel.
+  */
+ static void
+ do_kern_addr_fault(struct pt_regs *regs, unsigned long hw_error_code,
+ 		   unsigned long address)
+ {
+ 	/*
+ 	 * Protection keys exceptions only happen on user pages.  We
+ 	 * have no user pages in the kernel portion of the address
+ 	 * space, so do not expect them here.
+ 	 */
+ 	WARN_ON_ONCE(hw_error_code & X86_PF_PK);
+ 
+ 	/*
+ 	 * We can fault-in kernel-space virtual memory on-demand. The
+ 	 * 'reference' page table is init_mm.pgd.
+ 	 *
+ 	 * NOTE! We MUST NOT take any locks for this case. We may
+ 	 * be in an interrupt or a critical region, and should
+ 	 * only copy the information from the master page table,
+ 	 * nothing more.
+ 	 *
+ 	 * Before doing this on-demand faulting, ensure that the
+ 	 * fault is not any of the following:
+ 	 * 1. A fault on a PTE with a reserved bit set.
+ 	 * 2. A fault caused by a user-mode access.  (Do not demand-
+ 	 *    fault kernel memory due to user-mode accesses).
+ 	 * 3. A fault caused by a page-level protection violation.
+ 	 *    (A demand fault would be on a non-present page which
+ 	 *     would have X86_PF_PROT==0).
+ 	 */
+ 	if (!(hw_error_code & (X86_PF_RSVD | X86_PF_USER | X86_PF_PROT))) {
+ 		if (vmalloc_fault(address) >= 0)	/* non-negative: fault resolved */
+ 			return;
+ 	}
+ 
+ 	/* Was the fault spurious, caused by lazy TLB invalidation? */
+ 	if (spurious_kernel_fault(hw_error_code, address))
+ 		return;
+ 
+ 	/* Spurious faults were filtered out above; now kprobes may claim it: */
+ 	if (kprobe_page_fault(regs, X86_TRAP_PF))
+ 		return;
+ 
+ 	/*
+ 	 * Note, despite being a "bad area", there are quite a few
+ 	 * acceptable reasons to get here, such as erratum fixups
+ 	 * and handling kernel code that can fault, like get_user().
+ 	 *
+ 	 * Don't take the mm semaphore here. If we fixup a prefetch
+ 	 * fault we could otherwise deadlock:
+ 	 */
+ 	bad_area_nosemaphore(regs, hw_error_code, address);
+ }
+ NOKPROBE_SYMBOL(do_kern_addr_fault);
+ 
+ /* Handle faults in the user portion of the address space */
+ static inline
+ void do_user_addr_fault(struct pt_regs *regs,
+ 			unsigned long hw_error_code,
+ 			unsigned long address)
+ {
+ 	struct vm_area_struct *vma;
+ 	struct task_struct *tsk;
+ 	struct mm_struct *mm;
+ 	vm_fault_t fault, major = 0;
+ 	unsigned int flags = FAULT_FLAG_ALLOW_RETRY | FAULT_FLAG_KILLABLE;
+ 
+ 	tsk = current;
+ 	mm = tsk->mm;
+ 
+ 	/* kprobes don't want to hook the spurious faults: */
+ 	if (unlikely(kprobe_page_fault(regs, X86_TRAP_PF)))
+ 		return;
+ 
+ 	/*
+ 	 * Reserved bits are never expected to be set on
+ 	 * entries in the user portion of the page tables.
+ 	 */
+ 	if (unlikely(hw_error_code & X86_PF_RSVD))
+ 		pgtable_bad(regs, hw_error_code, address);
+ 
+ 	/*
+ 	 * If SMAP is on, check for invalid kernel (supervisor) access to user
+ 	 * pages in the user address space.  The odd case here is WRUSS,
+ 	 * which, according to the preliminary documentation, does not respect
+ 	 * SMAP and will have the USER bit set so, in all cases, SMAP
+ 	 * enforcement appears to be consistent with the USER bit.
+ 	 */
+ 	if (unlikely(cpu_feature_enabled(X86_FEATURE_SMAP) &&
+ 		     !(hw_error_code & X86_PF_USER) &&
+ 		     !(regs->flags & X86_EFLAGS_AC)))
+ 	{
+ 		bad_area_nosemaphore(regs, hw_error_code, address);
+ 		return;
+ 	}
+ 
+ 	/*
+ 	 * If we're in an interrupt, have no user context or are running
+ 	 * in a region with pagefaults disabled then we must not take the fault
+ 	 */
+ 	if (unlikely(faulthandler_disabled() || !mm)) {
+ 		bad_area_nosemaphore(regs, hw_error_code, address);
+ 		return;
+ 	}
+ 
+ 	/*
+ 	 * It's safe to allow irq's after cr2 has been saved and the
+ 	 * vmalloc fault has been handled.
+ 	 *
+ 	 * User-mode registers count as a user access even for any
+ 	 * potential system fault or CPU buglet:
+ 	 */
+ 	if (user_mode(regs)) {
+ 		local_irq_enable();
+ 		flags |= FAULT_FLAG_USER;
+ 	} else {
+ 		if (regs->flags & X86_EFLAGS_IF)
+ 			local_irq_enable();
+ 	}
+ 
+ 	perf_sw_event(PERF_COUNT_SW_PAGE_FAULTS, 1, regs, address);
+ 
+ 	if (hw_error_code & X86_PF_WRITE)
+ 		flags |= FAULT_FLAG_WRITE;
+ 	if (hw_error_code & X86_PF_INSTR)
+ 		flags |= FAULT_FLAG_INSTRUCTION;
+ 
+ #ifdef CONFIG_X86_64
+ 	/*
+ 	 * Faults in the vsyscall page might need emulation.  The
+ 	 * vsyscall page is at a high address (>PAGE_OFFSET), but is
+ 	 * considered to be part of the user address space.
+ 	 *
+ 	 * The vsyscall page does not have a "real" VMA, so do this
+ 	 * emulation before we go searching for VMAs.
+ 	 *
+ 	 * PKRU never rejects instruction fetches, so we don't need
+ 	 * to consider the PF_PK bit.
+ 	 */
+ 	if (is_vsyscall_vaddr(address)) {
+ 		if (emulate_vsyscall(hw_error_code, regs, address))
+ 			return;
+ 	}
+ #endif
+ 
+ 	/*
+ 	 * Kernel-mode access to the user address space should only occur
+ 	 * on well-defined single instructions listed in the exception
+ 	 * tables.  But, an erroneous kernel fault occurring outside one of
+ 	 * those areas which also holds mmap_sem might deadlock attempting
+ 	 * to validate the fault against the address space.
+ 	 *
+ 	 * Only do the expensive exception table search when we might be at
+ 	 * risk of a deadlock.  This happens if we
+ 	 * 1. Failed to acquire mmap_sem, and
+ 	 * 2. The access did not originate in userspace.
+ 	 */
+ 	if (unlikely(!down_read_trylock(&mm->mmap_sem))) {
+ 		if (!user_mode(regs) && !search_exception_tables(regs->ip)) {
+ 			/*
+ 			 * Fault from code in kernel from
+ 			 * which we do not expect faults.
+ 			 */
+ 			bad_area_nosemaphore(regs, hw_error_code, address);
+ 			return;
+ 		}
+ retry:
+ 		down_read(&mm->mmap_sem);
+ 	} else {
+ 		/*
+ 		 * The above down_read_trylock() might have succeeded in
+ 		 * which case we'll have missed the might_sleep() from
+ 		 * down_read():
+ 		 */
+ 		might_sleep();
+ 	}
+ 
+ #if defined(CONFIG_X86_32) && defined(CONFIG_MINISEC_PAGEEXEC)
+ 	if (pax_handle_pageexec_fault(regs, mm, address, hw_error_code))
+ 		return;	/* nonzero: the PAGEEXEC handler dealt with the fault */
+ #endif
+ 
+ 	vma = find_vma(mm, address);
+ 	if (unlikely(!vma)) {
+ 		bad_area(regs, hw_error_code, address);
+ 		return;
+ 	}
+ 	if (likely(vma->vm_start <= address))
+ 		goto good_area;
+ 	if (unlikely(!(vma->vm_flags & VM_GROWSDOWN))) {
+ 		bad_area(regs, hw_error_code, address);
+ 		return;
+ 	}
+ 	if (unlikely(expand_stack(vma, address))) {
+ 		bad_area(regs, hw_error_code, address);
+ 		return;
+ 	}
+ 
+ 	/*
+ 	 * Ok, we have a good vm_area for this memory access, so
+ 	 * we can handle it..
+ 	 */
+ good_area:
+ 	if (unlikely(access_error(hw_error_code, vma))) {
+ 		bad_area_access_error(regs, hw_error_code, address, vma);
+ 		return;
+ 	}
+ 
+ 	/*
+ 	 * If for any reason at all we couldn't handle the fault,
+ 	 * make sure we exit gracefully rather than endlessly redo
+ 	 * the fault.  Since we never set FAULT_FLAG_RETRY_NOWAIT, if
+ 	 * we get VM_FAULT_RETRY back, the mmap_sem has been unlocked.
+ 	 *
+ 	 * Note that handle_userfault() may also release and reacquire mmap_sem
+ 	 * (and not return with VM_FAULT_RETRY), when returning to userland to
+ 	 * repeat the page fault later with a VM_FAULT_NOPAGE retval
+ 	 * (potentially after handling any pending signal during the return to
+ 	 * userland). The return to userland is identified whenever
+ 	 * FAULT_FLAG_USER|FAULT_FLAG_KILLABLE are both set in flags.
+ 	 */
+ 	fault = handle_mm_fault(vma, address, flags);
+ 	major |= fault & VM_FAULT_MAJOR;
+ 
+ 	/*
+ 	 * If we need to retry the mmap_sem has already been released,
+ 	 * and if there is a fatal signal pending there is no guarantee
+ 	 * that we made any progress. Handle this case first.
+ 	 */
+ 	if (unlikely(fault & VM_FAULT_RETRY)) {
+ 		/* Retry at most once */
+ 		if (flags & FAULT_FLAG_ALLOW_RETRY) {
+ 			flags &= ~FAULT_FLAG_ALLOW_RETRY;
+ 			flags |= FAULT_FLAG_TRIED;
+ 			if (!fatal_signal_pending(tsk))
+ 				goto retry;
+ 		}
+ 
+ 		/* User mode? Just return to handle the fatal exception */
+ 		if (flags & FAULT_FLAG_USER)
+ 			return;
+ 
+ 		/* Not returning to user mode? Handle exceptions or die: */
+ 		no_context(regs, hw_error_code, address, SIGBUS, BUS_ADRERR);
+ 		return;
+ 	}
+ 
+ 	up_read(&mm->mmap_sem);
+ 	if (unlikely(fault & VM_FAULT_ERROR)) {
+ 		mm_fault_error(regs, hw_error_code, address, fault);
+ 		return;
+ 	}
+ 
+ 	/*
+ 	 * Major/minor page fault accounting. If any of the events
+ 	 * returned VM_FAULT_MAJOR, we account it as a major fault.
+ 	 */
+ 	if (major) {
+ 		tsk->maj_flt++;
+ 		perf_sw_event(PERF_COUNT_SW_PAGE_FAULTS_MAJ, 1, regs, address);
+ 	} else {
+ 		tsk->min_flt++;
+ 		perf_sw_event(PERF_COUNT_SW_PAGE_FAULTS_MIN, 1, regs, address);
+ 	}
+ 
+ 	check_v8086_mode(regs, address, tsk);
+ }
+ NOKPROBE_SYMBOL(do_user_addr_fault);
+ 
+ /*
+  * Explicitly marked noinline such that the function tracer sees this as the
+  * page_fault entry point.
+  */
+ static noinline void
+ __do_page_fault(struct pt_regs *regs, unsigned long hw_error_code,
+ 		unsigned long address)
+ {
+ 	prefetchw(&current->mm->mmap_sem);
+ 
+ 	if (unlikely(kmmio_fault(regs, address)))
+ 		return;
+ 
+ 	/* Dispatch on which part of the address space was touched. */
+ 	if (likely(!fault_in_kernel_space(address)))
+ 		do_user_addr_fault(regs, hw_error_code, address);
+ 	else
+ 		do_kern_addr_fault(regs, hw_error_code, address);
+ }
+ NOKPROBE_SYMBOL(__do_page_fault);
+ 
+ static __always_inline void
+ trace_page_fault_entries(struct pt_regs *regs, unsigned long error_code,
+ 			 unsigned long address)
+ {
+ 	/* Emit the kernel or user flavour of the tracepoint, if enabled. */
+ 	if (trace_pagefault_enabled()) {
+ 		if (!user_mode(regs))
+ 			trace_page_fault_kernel(address, regs, error_code);
+ 		else
+ 			trace_page_fault_user(address, regs, error_code);
+ 	}
+ }
+ 
+ dotraplinkage void
+ do_page_fault(struct pt_regs *regs, unsigned long error_code, unsigned long address)
+ {
+ 	/* Keep what exception_enter() returns; exception_exit() wants it back. */
+ 	enum ctx_state prev_state = exception_enter();
+ 
+ 	trace_page_fault_entries(regs, error_code, address);
+ 	__do_page_fault(regs, error_code, address);
+ 	exception_exit(prev_state);
+ }
+ NOKPROBE_SYMBOL(do_page_fault);
+ 
+ #if defined(CONFIG_MINISEC_PAGEEXEC)
+ static bool pax_is_fetch_fault(struct pt_regs *regs, unsigned long error_code, unsigned long address)
+ {
+ 	struct mm_struct *mm = current->mm;
+ 	unsigned long ip = regs->ip;
+ 
+ 	if (v8086_mode(regs))
+ 		ip = ((regs->cs & 0xffff) << 4) + (ip & 0xffff);
+ 
+ #ifdef CONFIG_MINISEC_PAGEEXEC
+ 	if (!(mm->pax_flags & MF_PAX_PAGEEXEC))
+ 		return false;
+ 	/* With NX in the supported PTE mask, trust the INSTR error bit. */
+ 	if ((__supported_pte_mask & _PAGE_NX) && (error_code & X86_PF_INSTR))
+ 		return true;
+ 	/* Else: a not-present, non-write fault located exactly at the IP. */
+ 	return !(error_code & (X86_PF_PROT | X86_PF_WRITE)) && ip == address;
+ #else
+ 	return false;
+ #endif
+ }
+ #endif
+ 
+ #ifdef CONFIG_MINISEC_EMUTRAMP
+ static int pax_handle_fetch_fault_32(struct pt_regs *regs)
+ {
+ 	int err;	/* OR of the get_user() results; nonzero means a read failed */
+ 
+ 	do { /* PaX: libffi trampoline emulation (bytes: B8 imm32, E9 rel32) */
+ 		unsigned char mov, jmp;
+ 		unsigned int addr1, addr2;
+ 
+ #ifdef CONFIG_X86_64
+ 		if ((regs->ip + 9) >> 32)
+ 			break;
+ #endif
+ 
+ 		err = get_user(mov, (unsigned char __user *)regs->ip);
+ 		err |= get_user(addr1, (unsigned int __user *)(regs->ip + 1));
+ 		err |= get_user(jmp, (unsigned char __user *)(regs->ip + 5));
+ 		err |= get_user(addr2, (unsigned int __user *)(regs->ip + 6));
+ 
+ 		if (err)
+ 			break;
+ 
+ 		if (mov == 0xB8 && jmp == 0xE9) {
+ 			regs->ax = addr1;
+ 			regs->ip = (unsigned int)(regs->ip + addr2 + 10);	/* rel. to end of the 10 bytes */
+ 			return 2;
+ 		}
+ 	} while (0);
+ 
+ 	do { /* PaX: gcc trampoline emulation #1 (bytes: B9 imm32, B8 imm32, FF E0) */
+ 		unsigned char mov1, mov2;
+ 		unsigned short jmp;
+ 		unsigned int addr1, addr2;
+ 
+ #ifdef CONFIG_X86_64
+ 		if ((regs->ip + 11) >> 32)
+ 			break;
+ #endif
+ 
+ 		err = get_user(mov1, (unsigned char __user *)regs->ip);
+ 		err |= get_user(addr1, (unsigned int __user *)(regs->ip + 1));
+ 		err |= get_user(mov2, (unsigned char __user *)(regs->ip + 5));
+ 		err |= get_user(addr2, (unsigned int __user *)(regs->ip + 6));
+ 		err |= get_user(jmp, (unsigned short __user *)(regs->ip + 10));
+ 
+ 		if (err)
+ 			break;
+ 
+ 		if (mov1 == 0xB9 && mov2 == 0xB8 && jmp == 0xE0FF) {
+ 			regs->cx = addr1;
+ 			regs->ax = addr2;
+ 			regs->ip = addr2;	/* continue at the value just loaded into ax */
+ 			return 2;
+ 		}
+ 	} while (0);
+ 
+ 	do { /* PaX: gcc trampoline emulation #2 (bytes: B9 imm32, E9 rel32) */
+ 		unsigned char mov, jmp;
+ 		unsigned int addr1, addr2;
+ 
+ #ifdef CONFIG_X86_64
+ 		if ((regs->ip + 9) >> 32)
+ 			break;
+ #endif
+ 
+ 		err = get_user(mov, (unsigned char __user *)regs->ip);
+ 		err |= get_user(addr1, (unsigned int __user *)(regs->ip + 1));
+ 		err |= get_user(jmp, (unsigned char __user *)(regs->ip + 5));
+ 		err |= get_user(addr2, (unsigned int __user *)(regs->ip + 6));
+ 
+ 		if (err)
+ 			break;
+ 
+ 		if (mov == 0xB9 && jmp == 0xE9) {
+ 			regs->cx = addr1;
+ 			regs->ip = (unsigned int)(regs->ip + addr2 + 10);	/* rel. to end of the 10 bytes */
+ 			return 2;
+ 		}
+ 	} while (0);
+ 
+ 	return 1; /* PaX in action: none of the layouts above matched */
+ }
+ 
+ #ifdef CONFIG_X86_64
+ static int pax_handle_fetch_fault_64(struct pt_regs *regs)
+ {
+ 	int err;	/* OR of the get_user() results; nonzero means a read failed */
+ 
+ 	do { /* PaX: libffi trampoline emulation (49 BB imm64, 49 BA imm64, F8|F9, 49 FF E3) */
+ 		unsigned short mov1, mov2, jmp1;
+ 		unsigned char stcclc, jmp2;
+ 		unsigned long addr1, addr2;
+ 
+ 		err = get_user(mov1, (unsigned short __user *)regs->ip);
+ 		err |= get_user(addr1, (unsigned long __user *)(regs->ip + 2));
+ 		err |= get_user(mov2, (unsigned short __user *)(regs->ip + 10));
+ 		err |= get_user(addr2, (unsigned long __user *)(regs->ip + 12));
+ 		err |= get_user(stcclc, (unsigned char __user *)(regs->ip + 20));
+ 		err |= get_user(jmp1, (unsigned short __user *)(regs->ip + 21));
+ 		err |= get_user(jmp2, (unsigned char __user *)(regs->ip + 23));
+ 
+ 		if (err)
+ 			break;
+ 
+ 		if (mov1 == 0xBB49 && mov2 == 0xBA49 && (stcclc == 0xF8 || stcclc == 0xF9) && jmp1 == 0xFF49 && jmp2 == 0xE3) {
+ 			regs->r11 = addr1;
+ 			regs->r10 = addr2;
+ 			if (stcclc == 0xF8)	/* byte F8 clears the carry flag, F9 sets it */
+ 				regs->flags &= ~X86_EFLAGS_CF;
+ 			else
+ 				regs->flags |= X86_EFLAGS_CF;
+ 			regs->ip = addr1;
+ 			return 2;
+ 		}
+ 	} while (0);
+ 
+ 	do { /* PaX: gcc trampoline emulation #1 (41 BB imm32, 49 BA imm64, 49 FF E3) */
+ 		unsigned short mov1, mov2, jmp1;
+ 		unsigned char jmp2;
+ 		unsigned int addr1;
+ 		unsigned long addr2;
+ 
+ 		err = get_user(mov1, (unsigned short __user *)regs->ip);
+ 		err |= get_user(addr1, (unsigned int __user *)(regs->ip + 2));
+ 		err |= get_user(mov2, (unsigned short __user *)(regs->ip + 6));
+ 		err |= get_user(addr2, (unsigned long __user *)(regs->ip + 8));
+ 		err |= get_user(jmp1, (unsigned short __user *)(regs->ip + 16));
+ 		err |= get_user(jmp2, (unsigned char __user *)(regs->ip + 18));
+ 
+ 		if (err)
+ 			break;
+ 
+ 		if (mov1 == 0xBB41 && mov2 == 0xBA49 && jmp1 == 0xFF49 && jmp2 == 0xE3) {
+ 			regs->r11 = addr1;	/* 32-bit value, zero-extended by the assignment */
+ 			regs->r10 = addr2;
+ 			regs->ip = addr1;
+ 			return 2;
+ 		}
+ 	} while (0);
+ 
+ 	do { /* PaX: gcc trampoline emulation #2 (49 BB imm64, 49 BA imm64, 49 FF E3) */
+ 		unsigned short mov1, mov2, jmp1;
+ 		unsigned char jmp2;
+ 		unsigned long addr1, addr2;
+ 
+ 		err = get_user(mov1, (unsigned short __user *)regs->ip);
+ 		err |= get_user(addr1, (unsigned long __user *)(regs->ip + 2));
+ 		err |= get_user(mov2, (unsigned short __user *)(regs->ip + 10));
+ 		err |= get_user(addr2, (unsigned long __user *)(regs->ip + 12));
+ 		err |= get_user(jmp1, (unsigned short __user *)(regs->ip + 20));
+ 		err |= get_user(jmp2, (unsigned char __user *)(regs->ip + 22));
+ 
+ 		if (err)
+ 			break;
+ 
+ 		if (mov1 == 0xBB49 && mov2 == 0xBA49 && jmp1 == 0xFF49 && jmp2 == 0xE3) {
+ 			regs->r11 = addr1;
+ 			regs->r10 = addr2;
+ 			regs->ip = addr1;
+ 			return 2;
+ 		}
+ 	} while (0);
+ 
+ 	return 1; /* PaX in action: none of the layouts above matched */
+ }
+ #endif
+ 
+ /*
+  * PaX: decide what to do with offenders (regs->ip = fault address)
+  *
+  * Returns 1 when the task should be killed,
+  *         2 when a known trampoline was detected and emulated.
+  */
+ static int pax_handle_fetch_fault(struct pt_regs *regs)
+ {
+ 	/* vm86 tasks and tasks without EMUTRAMP get no emulation. */
+ 	if (v8086_mode(regs) || !(current->mm->pax_flags & MF_PAX_EMUTRAMP))
+ 		return 1;
+ 
+ #ifndef CONFIG_X86_32
+ 	/*
+ 	 * Neither the 32-bit user code segment nor an LDT selector:
+ 	 * match against the 64-bit trampoline layouts.
+ 	 */
+ 	if (regs->cs != __USER32_CS && !(regs->cs & SEGMENT_LDT))
+ 		return pax_handle_fetch_fault_64(regs);
+ #endif
+ 
+ 	return pax_handle_fetch_fault_32(regs);
+ }
+ #endif
+ 
+ #if defined(CONFIG_MINISEC_PAGEEXEC)
+ /* PaX: log 20 bytes of user memory at @pc and the words from @sp - sizeof(long) on. */
+ void pax_report_insns(struct pt_regs *regs, void *pc, void *sp)
+ {
+ 	long i;
+ 
+ 	printk(KERN_ERR "PAX: bytes at PC: ");
+ 	for (i = 0; i < 20; i++) {
+ 		unsigned char c;
+ 		if (get_user(c, (unsigned char *)pc+i))
+ 			printk(KERN_CONT "?? ");	/* byte could not be read */
+ 		else
+ 			printk(KERN_CONT "%02x ", c);
+ 	}
+ 	printk(KERN_CONT "\n");	/* terminate the continued line, not start a new record */
+ 
+ 	printk(KERN_ERR "PAX: bytes at SP-%lu: ", (unsigned long)sizeof(long));
+ 	for (i = -1; i < 80 / (long)sizeof(long); i++) {	/* starts one word below sp */
+ 		unsigned long c;
+ 		if (get_user(c, (unsigned long *)sp+i)) {
+ #ifdef CONFIG_X86_32
+ 			printk(KERN_CONT "???????? ");
+ #else
+ 			if ((regs->cs == __USER32_CS || (regs->cs & SEGMENT_LDT)))
+ 				printk(KERN_CONT "???????? ???????? ");
+ 			else
+ 				printk(KERN_CONT "???????????????? ");
+ #endif
+ 		} else {
+ #ifdef CONFIG_X86_64
+ 			/* 32-bit code segment: show the 64-bit word as two 32-bit halves */
+ 			if ((regs->cs == __USER32_CS || (regs->cs & SEGMENT_LDT)))
+ 				printk(KERN_CONT "%08x %08x ", (unsigned int)c, (unsigned int)(c >> 32));
+ 			else
+ #endif
+ 				printk(KERN_CONT "%0*lx ", 2 * (int)sizeof(long), c);
+ 		}
+ 	}
+ 	printk(KERN_CONT "\n");	/* terminate the continued line, not start a new record */
+ }
+ #endif
diff --color -rcNP Master/arch/x86/mm/fault.c.rej OG/arch/x86/mm/fault.c.rej
*** Master/arch/x86/mm/fault.c.rej	1969-12-31 19:00:00.000000000 -0500
--- OG/arch/x86/mm/fault.c.rej	2021-04-20 15:11:27.310000000 -0400
***************
*** 0 ****
--- 1,19 ----
+ *** arch/x86/mm/fault.c	2021-03-13 13:37:22.000000000 +0200
+ --- arch/x86/mm/fault.c	2021-03-11 15:06:51.000000000 +0200
+ ***************
+ *** 420,426 ****
+   
+   #ifdef CONFIG_CPU_SUP_AMD
+   static const char errata93_warning[] =
+ ! KERN_ERR
+   "******* Your BIOS seems to not contain a fix for K8 errata #93\n"
+   "******* Working around it, but it may cause SEGVs or burn power.\n"
+   "******* Please consider a BIOS update.\n"
+ --- 412,418 ----
+   
+   #ifdef CONFIG_CPU_SUP_AMD
+   static const char errata93_warning[] =
+ ! KERN_ERR
+   "******* Your BIOS seems to not contain a fix for K8 errata #93\n"
+   "******* Working around it, but it may cause SEGVs or burn power.\n"
+   "******* Please consider a BIOS update.\n"
diff --color -rcNP Master/arch/x86/mm/setup_nx.c OG/arch/x86/mm/setup_nx.c
*** Master/arch/x86/mm/setup_nx.c	2021-04-20 14:17:26.000000000 -0400
--- OG/arch/x86/mm/setup_nx.c	2021-04-20 15:11:34.500000000 -0400
***************
*** 7,14 ****
--- 7,16 ----
  #include <asm/proto.h>
  #include <asm/cpufeature.h>
  
+ #if defined(CONFIG_X86_64) || defined(CONFIG_X86_PAE)
  static int disable_nx;
  
+ #ifndef CONFIG_MINISEC_PAGEEXEC
  /*
   * noexec = on|off
   *
***************
*** 30,35 ****
--- 32,40 ----
  	return 0;
  }
  early_param("noexec", noexec_setup);
+ #endif
+ 
+ #endif
  
  void x86_configure_nx(void)
  {
diff --color -rcNP Master/drivers/pci/proc.c OG/drivers/pci/proc.c
*** Master/drivers/pci/proc.c	2021-04-20 14:17:30.000000000 -0400
--- OG/drivers/pci/proc.c	2021-04-20 15:11:34.504000000 -0400
***************
*** 449,455 ****
--- 449,463 ----
  static int __init pci_proc_init(void)
  {
  	struct pci_dev *dev = NULL;
+ #ifdef CONFIG_MINISEC_PROC_ADD
+ #ifdef CONFIG_MINISEC_PROC_USER
+ 	proc_bus_pci_dir = proc_mkdir_mode("bus/pci", S_IRUSR | S_IXUSR, NULL);
+ #elif defined(CONFIG_MINISEC_PROC_USERGROUP)
+ 	proc_bus_pci_dir = proc_mkdir_mode("bus/pci", S_IRUSR | S_IXUSR | S_IRGRP | S_IXGRP, NULL);
+ #endif
+ #else
  	proc_bus_pci_dir = proc_mkdir("bus/pci", NULL);
+ #endif
  	proc_create_seq("devices", 0, proc_bus_pci_dir,
  		    &proc_bus_pci_devices_op);
  	proc_initialized = 1;
diff --color -rcNP Master/drivers/tty/tty_io.c OG/drivers/tty/tty_io.c
*** Master/drivers/tty/tty_io.c	2021-04-20 14:17:30.000000000 -0400
--- OG/drivers/tty/tty_io.c	2021-04-20 15:11:34.505000000 -0400
***************
*** 108,113 ****
--- 108,115 ----
  #include <linux/kmod.h>
  #include <linux/nsproxy.h>
  
+ #include <linux/minisec.h>
+ 
  #undef TTY_DEBUG_HANGUP
  #ifdef TTY_DEBUG_HANGUP
  # define tty_debug_hangup(tty, f, args...)	tty_debug(tty, f, ##args)
***************
*** 2194,2199 ****
--- 2196,2203 ----
  			"Denied TIOCSTI ioctl for non-privileged process\n");
  		return -EPERM;
  	}
+ 	if (gr_handle_tiocsti(tty))
+ 		return -EPERM;
  	if ((current->signal->tty != tty) && !capable(CAP_SYS_ADMIN))
  		return -EPERM;
  	if (get_user(ch, p))
***************
*** 2875,2881 ****
  		return 0;
  	return file_tty(file) != t ? 0 : fd + 1;
  }
! 	
  /*
   * This implements the "Secure Attention Key" ---  the idea is to
   * prevent trojan horses by killing all processes associated with this
--- 2879,2885 ----
  		return 0;
  	return file_tty(file) != t ? 0 : fd + 1;
  }
! 
  /*
   * This implements the "Secure Attention Key" ---  the idea is to
   * prevent trojan horses by killing all processes associated with this
***************
*** 3520,3523 ****
  #endif
  	return 0;
  }
- 
--- 3524,3526 ----
diff --color -rcNP Master/drivers/tty/tty_io.c.orig OG/drivers/tty/tty_io.c.orig
*** Master/drivers/tty/tty_io.c.orig	1969-12-31 19:00:00.000000000 -0500
--- OG/drivers/tty/tty_io.c.orig	2021-04-20 15:10:45.376000000 -0400
***************
*** 0 ****
--- 1,3526 ----
+ // SPDX-License-Identifier: GPL-2.0
+ /*
+  *  Copyright (C) 1991, 1992  Linus Torvalds
+  */
+ 
+ /*
+  * 'tty_io.c' gives an orthogonal feeling to tty's, be they consoles
+  * or rs-channels. It also implements echoing, cooked mode etc.
+  *
+  * Kill-line thanks to John T Kohl, who also corrected VMIN = VTIME = 0.
+  *
+  * Modified by Theodore Ts'o, 9/14/92, to dynamically allocate the
+  * tty_struct and tty_queue structures.  Previously there was an array
+  * of 256 tty_struct's which was statically allocated, and the
+  * tty_queue structures were allocated at boot time.  Both are now
+  * dynamically allocated only when the tty is open.
+  *
+  * Also restructured routines so that there is more of a separation
+  * between the high-level tty routines (tty_io.c and tty_ioctl.c) and
+  * the low-level tty routines (serial.c, pty.c, console.c).  This
+  * makes for cleaner and more compact code.  -TYT, 9/17/92
+  *
+  * Modified by Fred N. van Kempen, 01/29/93, to add line disciplines
+  * which can be dynamically activated and de-activated by the line
+  * discipline handling modules (like SLIP).
+  *
+  * NOTE: pay no attention to the line discipline code (yet); its
+  * interface is still subject to change in this version...
+  * -- TYT, 1/31/92
+  *
+  * Added functionality to the OPOST tty handling.  No delays, but all
+  * other bits should be there.
+  *	-- Nick Holloway <alfie@dcs.warwick.ac.uk>, 27th May 1993.
+  *
+  * Rewrote canonical mode and added more termios flags.
+  * 	-- julian@uhunix.uhcc.hawaii.edu (J. Cowley), 13Jan94
+  *
+  * Reorganized FASYNC support so mouse code can share it.
+  *	-- ctm@ardi.com, 9Sep95
+  *
+  * New TIOCLINUX variants added.
+  *	-- mj@k332.feld.cvut.cz, 19-Nov-95
+  *
+  * Restrict vt switching via ioctl()
+  *      -- grif@cs.ucr.edu, 5-Dec-95
+  *
+  * Move console and virtual terminal code to more appropriate files,
+  * implement CONFIG_VT and generalize console device interface.
+  *	-- Marko Kohtala <Marko.Kohtala@hut.fi>, March 97
+  *
+  * Rewrote tty_init_dev and tty_release_dev to eliminate races.
+  *	-- Bill Hawes <whawes@star.net>, June 97
+  *
+  * Added devfs support.
+  *      -- C. Scott Ananian <cananian@alumni.princeton.edu>, 13-Jan-1998
+  *
+  * Added support for a Unix98-style ptmx device.
+  *      -- C. Scott Ananian <cananian@alumni.princeton.edu>, 14-Jan-1998
+  *
+  * Reduced memory usage for older ARM systems
+  *      -- Russell King <rmk@arm.linux.org.uk>
+  *
+  * Move do_SAK() into process context.  Less stack use in devfs functions.
+  * alloc_tty_struct() always uses kmalloc()
+  *			 -- Andrew Morton <andrewm@uow.edu.eu> 17Mar01
+  */
+ 
+ #include <linux/types.h>
+ #include <linux/major.h>
+ #include <linux/errno.h>
+ #include <linux/signal.h>
+ #include <linux/fcntl.h>
+ #include <linux/sched/signal.h>
+ #include <linux/sched/task.h>
+ #include <linux/interrupt.h>
+ #include <linux/tty.h>
+ #include <linux/tty_driver.h>
+ #include <linux/tty_flip.h>
+ #include <linux/devpts_fs.h>
+ #include <linux/file.h>
+ #include <linux/fdtable.h>
+ #include <linux/console.h>
+ #include <linux/timer.h>
+ #include <linux/ctype.h>
+ #include <linux/kd.h>
+ #include <linux/mm.h>
+ #include <linux/string.h>
+ #include <linux/slab.h>
+ #include <linux/poll.h>
+ #include <linux/proc_fs.h>
+ #include <linux/init.h>
+ #include <linux/module.h>
+ #include <linux/device.h>
+ #include <linux/wait.h>
+ #include <linux/bitops.h>
+ #include <linux/delay.h>
+ #include <linux/seq_file.h>
+ #include <linux/serial.h>
+ #include <linux/ratelimit.h>
+ #include <linux/compat.h>
+ 
+ #include <linux/uaccess.h>
+ 
+ #include <linux/kbd_kern.h>
+ #include <linux/vt_kern.h>
+ #include <linux/selection.h>
+ 
+ #include <linux/kmod.h>
+ #include <linux/nsproxy.h>
+ 
+ #include <linux/minisec.h>
+ 
+ #undef TTY_DEBUG_HANGUP
+ #ifdef TTY_DEBUG_HANGUP
+ # define tty_debug_hangup(tty, f, args...)	tty_debug(tty, f, ##args)
+ #else
+ # define tty_debug_hangup(tty, f, args...)	do { } while (0)
+ #endif
+ 
+ #define TTY_PARANOIA_CHECK 1
+ #define CHECK_TTY_COUNT 1
+ 
+ struct ktermios tty_std_termios = {	/* default termios, exported for tty drivers */
+ 	.c_iflag = ICRNL | IXON,
+ 	.c_oflag = OPOST | ONLCR,
+ 	.c_cflag = B38400 | CS8 | CREAD | HUPCL,
+ 	.c_lflag = ISIG | ICANON | ECHO | ECHOE | ECHOK |
+ 		   ECHOCTL | ECHOKE | IEXTEN,
+ 	.c_cc = INIT_C_CC,
+ 	.c_ispeed = 38400,
+ 	.c_ospeed = 38400,
+ 	/* .c_line = N_TTY, */
+ };
+ 
+ EXPORT_SYMBOL(tty_std_termios);
+ 
+ /* This list gets poked at by procfs and various bits of boot up code. This
+    could do with some rationalisation such as pulling the tty proc function
+    into this file */
+ 
+ LIST_HEAD(tty_drivers);			/* linked list of tty drivers */
+ 
+ /* Mutex to protect creating and releasing a tty */
+ DEFINE_MUTEX(tty_mutex);
+ 
+ static ssize_t tty_read(struct file *, char __user *, size_t, loff_t *);
+ static ssize_t tty_write(struct file *, const char __user *, size_t, loff_t *);
+ ssize_t redirected_tty_write(struct file *, const char __user *,
+ 							size_t, loff_t *);
+ static __poll_t tty_poll(struct file *, poll_table *);
+ static int tty_open(struct inode *, struct file *);
+ long tty_ioctl(struct file *file, unsigned int cmd, unsigned long arg);
+ #ifdef CONFIG_COMPAT
+ static long tty_compat_ioctl(struct file *file, unsigned int cmd,
+ 				unsigned long arg);
+ #else
+ #define tty_compat_ioctl NULL
+ #endif
+ static int __tty_fasync(int fd, struct file *filp, int on);
+ static int tty_fasync(int fd, struct file *filp, int on);
+ static void release_tty(struct tty_struct *tty, int idx);
+ 
+ /**
+  *	free_tty_struct		-	free a disused tty
+  *	@tty: tty struct to free
+  *
+  *	Deinit the ldisc, put the device and user ns, free write_buf and the tty.
+  *
+  *	Locking: none. Must be called after tty is definitely unused
+  */
+ 
+ static void free_tty_struct(struct tty_struct *tty)
+ {
+ 	tty_ldisc_deinit(tty);
+ 	put_device(tty->dev);
+ 	kfree(tty->write_buf);
+ 	tty->magic = 0xDEADDEAD;	/* no longer TTY_MAGIC, so paranoia checks reject it */
+ 	put_user_ns(tty->owner_user_ns);
+ 	kfree(tty);
+ }
+ 
+ static inline struct tty_struct *file_tty(struct file *file)
+ {
+ 	return ((struct tty_file_private *)file->private_data)->tty; /* set by tty_add_file() */
+ }
+ 
+ int tty_alloc_file(struct file *file)
+ {
+ 	/* Contents are left unset here; tty_add_file() fills them in. */
+ 	struct tty_file_private *fpriv = kmalloc(sizeof(*fpriv), GFP_KERNEL);
+ 
+ 	if (fpriv == NULL)
+ 		return -ENOMEM;
+ 
+ 	/* Park the allocation on the file for later lookup. */
+ 	file->private_data = fpriv;
+ 	return 0;
+ }
+ 
+ /* Bind @file to @tty and put it on the tty's list of open files */
+ void tty_add_file(struct tty_struct *tty, struct file *file)
+ {
+ 	struct tty_file_private *fpriv = file->private_data;
+ 
+ 	fpriv->file = file;
+ 	fpriv->tty = tty;
+ 
+ 	spin_lock(&tty->files_lock);
+ 	list_add(&fpriv->list, &tty->tty_files);
+ 	spin_unlock(&tty->files_lock);
+ }
+ 
+ /**
+  * tty_free_file - free file->private_data
+  * @file: file whose private data is freed and reset to NULL
+  *
+  * Only for failure paths that run before tty_add_file() was called.
+  */
+ void tty_free_file(struct file *file)
+ {
+ 	void *fpriv = file->private_data;
+ 
+ 	file->private_data = NULL;
+ 	kfree(fpriv);
+ }
+ 
+ /* Unlink @file from its tty's list of open files, then free its private data */
+ static void tty_del_file(struct file *file)
+ {
+ 	struct tty_file_private *fpriv = file->private_data;
+ 	spinlock_t *lock = &fpriv->tty->files_lock;
+ 
+ 	spin_lock(lock);
+ 	list_del(&fpriv->list);
+ 	spin_unlock(lock);
+ 	tty_free_file(file);
+ }
+ 
+ /**
+  *	tty_name	-	return tty naming
+  *	@tty: tty structure
+  *
+  *	Convert a tty structure into a name. The name reflects the kernel
+  *	naming policy and if udev is in use may not reflect user space
+  *
+  *	Locking: none
+  */
+ 
+ const char *tty_name(const struct tty_struct *tty)
+ {
+ 	/* A NULL tty still yields a printable placeholder. */
+ 	return tty ? tty->name : "NULL tty";
+ }
+ 
+ EXPORT_SYMBOL(tty_name);
+ 
+ /* Name of the driver behind @tty, or "" when there is no tty or driver. */
+ const char *tty_driver_name(const struct tty_struct *tty)
+ {
+ 	if (tty && tty->driver)
+ 		return tty->driver->name;
+ 
+ 	return "";
+ }
+ 
+ /*
+  * Sanity-check @tty on behalf of @routine. Returns 1 (after logging the
+  * device numbers of @inode) when the tty is NULL or its magic is wrong,
+  * 0 otherwise. Without TTY_PARANOIA_CHECK this always returns 0.
+  */
+ static int tty_paranoia_check(struct tty_struct *tty, struct inode *inode,
+ 			      const char *routine)
+ {
+ #ifdef TTY_PARANOIA_CHECK
+ 	if (tty == NULL) {
+ 		pr_warn("(%d:%d): %s: NULL tty\n",
+ 			imajor(inode), iminor(inode), routine);
+ 		return 1;
+ 	} else if (tty->magic != TTY_MAGIC) {
+ 		pr_warn("(%d:%d): %s: bad magic number\n",
+ 			imajor(inode), iminor(inode), routine);
+ 		return 1;
+ 	}
+ #endif
+ 	return 0;
+ }
+ 
+ /*
+  * Debug helper: compare tty->count against the number of users that can
+  * be counted directly. Returns 0 when they agree (or when CHECK_TTY_COUNT
+  * is not defined), otherwise warns and returns the counted value.
+  *
+  * Caller must hold tty_lock
+  */
+ static int check_tty_count(struct tty_struct *tty, const char *routine)
+ {
+ #ifdef CHECK_TTY_COUNT
+ 	struct list_head *pos;
+ 	int nr_files = 0;
+ 	int nr_kopens = 0;
+ 
+ 	/* One entry on tty_files per open file */
+ 	spin_lock(&tty->files_lock);
+ 	list_for_each(pos, &tty->tty_files)
+ 		nr_files++;
+ 	spin_unlock(&tty->files_lock);
+ 
+ 	/* A pty slave whose linked tty is in use counts once more */
+ 	if (tty->driver->type == TTY_DRIVER_TYPE_PTY &&
+ 	    tty->driver->subtype == PTY_TYPE_SLAVE &&
+ 	    tty->link && tty->link->count)
+ 		nr_files++;
+ 
+ 	if (tty_port_kopened(tty->port))
+ 		nr_kopens++;
+ 
+ 	if (tty->count != (nr_files + nr_kopens)) {
+ 		tty_warn(tty, "%s: tty->count(%d) != (#fd's(%d) + #kopen's(%d))\n",
+ 			 routine, tty->count, nr_files, nr_kopens);
+ 		return (nr_files + nr_kopens);
+ 	}
+ #endif
+ 	return 0;
+ }
+ 
+ /**
+  *	get_tty_driver		-	find device of a tty
+  *	@device: device identifier
+  *	@index: returns the index of the tty
+  *
+  *	This routine returns a tty driver structure, given a device number
+  *	and also passes back the index number.
+  *
+  *	Locking: caller must hold tty_mutex
+  */
+ 
+ static struct tty_driver *get_tty_driver(dev_t device, int *index)
+ {
+ 	struct tty_driver *drv;
+ 
+ 	list_for_each_entry(drv, &tty_drivers, tty_drivers) {
+ 		dev_t first = MKDEV(drv->major, drv->minor_start);
+ 
+ 		/* Does @device fall inside this driver's device range? */
+ 		if (device >= first && device < first + drv->num) {
+ 			*index = device - first;
+ 			return tty_driver_kref_get(drv);
+ 		}
+ 	}
+ 
+ 	/* No registered driver covers @device; *index is left untouched. */
+ 	return NULL;
+ }
+ 
+ /**
+  *	tty_dev_name_to_number	-	return dev_t for device name
+  *	@name: user space name of device under /dev
+  *	@number: pointer to dev_t that this function will populate
+  *
+  *	This function converts device names like ttyS0 or ttyUSB1 into dev_t
+  *	like (4, 64) or (188, 1). If no corresponding driver is registered then
+  *	the function returns -ENODEV.
+  *
+  *	Locking: this acquires tty_mutex to protect the tty_drivers list from
+  *		being modified while we are traversing it, and makes sure to
+  *		release it before exiting.
+  */
+ int tty_dev_name_to_number(const char *name, dev_t *number)
+ {
+ 	struct tty_driver *drv;
+ 	const char *digits = name;
+ 	int index, prefix_length = 0;
+ 	int ret;
+ 
+ 	/* Split "ttyS0" style names into a driver prefix and a line number. */
+ 	while (*digits && !isdigit(*digits))
+ 		digits++;
+ 
+ 	if (*digits == '\0')
+ 		return -EINVAL;
+ 
+ 	ret = kstrtoint(digits, 10, &index);
+ 	if (ret)
+ 		return ret;
+ 
+ 	prefix_length = digits - name;
+ 
+ 	/* Assume failure until a driver with a matching name and line is seen. */
+ 	ret = -ENODEV;
+ 
+ 	mutex_lock(&tty_mutex);
+ 	list_for_each_entry(drv, &tty_drivers, tty_drivers) {
+ 		if (prefix_length != strlen(drv->name))
+ 			continue;
+ 		if (strncmp(name, drv->name, prefix_length) != 0)
+ 			continue;
+ 		if (index >= drv->num)
+ 			continue;
+ 
+ 		*number = MKDEV(drv->major, drv->minor_start + index);
+ 		ret = 0;
+ 		break;
+ 	}
+ 	mutex_unlock(&tty_mutex);
+ 
+ 	return ret;
+ }
+ EXPORT_SYMBOL_GPL(tty_dev_name_to_number);
+ 
+ #ifdef CONFIG_CONSOLE_POLL
+ 
+ /**
+  *	tty_find_polling_driver	-	find device of a polled tty
+  *	@name: name string to match
+  *	@line: pointer to resulting tty line nr
+  *
+  *	This routine returns a tty driver structure, given a name
+  *	and the condition that the tty driver is capable of polled
+  *	operation.
+  */
+ struct tty_driver *tty_find_polling_driver(char *name, int *line)
+ {
+ 	struct tty_driver *p, *res = NULL;
+ 	int tty_line = 0;
+ 	int len;
+ 	char *str, *stp;
+ 
+ 	/* The driver name ends at the first digit or comma. */
+ 	for (str = name; *str; str++)
+ 		if ((*str >= '0' && *str <= '9') || *str == ',')
+ 			break;
+ 	if (!*str)
+ 		return NULL;
+ 
+ 	len = str - name;
+ 	/* Parse the line number; str is advanced past the digits. */
+ 	tty_line = simple_strtoul(str, &str, 10);
+ 
+ 	mutex_lock(&tty_mutex);
+ 	/* Search through the tty devices to look for a match */
+ 	list_for_each_entry(p, &tty_drivers, tty_drivers) {
+ 		/* Only the first len characters of the driver name are compared. */
+ 		if (!len || strncmp(name, p->name, len) != 0)
+ 			continue;
+ 		/* Whatever follows an optional ',' is handed to poll_init(). */
+ 		stp = str;
+ 		if (*stp == ',')
+ 			stp++;
+ 		if (*stp == '\0')
+ 			stp = NULL;
+ 
+ 		/* The first driver whose poll_init() returns 0 wins. */
+ 		if (tty_line >= 0 && tty_line < p->num && p->ops &&
+ 		    p->ops->poll_init && !p->ops->poll_init(p, tty_line, stp)) {
+ 			res = tty_driver_kref_get(p);
+ 			*line = tty_line;
+ 			break;
+ 		}
+ 	}
+ 	mutex_unlock(&tty_mutex);
+ 
+ 	/* NULL when nothing matched; *line is only written on success. */
+ 	return res;
+ }
+ EXPORT_SYMBOL_GPL(tty_find_polling_driver);
+ #endif
+ 
+ /* read() handler used by hung_up_tty_fops: always reports 0 bytes. */
+ static ssize_t hung_up_tty_read(struct file *file, char __user *buf,
+ 				size_t count, loff_t *ppos)
+ {
+ 	return 0;
+ }
+ 
+ /* write() handler used by hung_up_tty_fops: always fails with -EIO. */
+ static ssize_t hung_up_tty_write(struct file *file, const char __user *buf,
+ 				 size_t count, loff_t *ppos)
+ {
+ 	return -EIO;
+ }
+ 
+ /*
+  * poll() handler used by hung_up_tty_fops: reports the file as readable,
+  * writable, in error and hung up, all at once.
+  *
+  * No kernel lock held - none needed ;)
+  */
+ static __poll_t hung_up_tty_poll(struct file *filp, poll_table *wait)
+ {
+ 	return EPOLLIN | EPOLLOUT | EPOLLERR | EPOLLHUP | EPOLLRDNORM | EPOLLWRNORM;
+ }
+ 
+ /*
+  * ioctl() handler used by hung_up_tty_fops: TIOCSPGRP fails with -ENOTTY,
+  * every other command with -EIO.
+  */
+ static long hung_up_tty_ioctl(struct file *file, unsigned int cmd,
+ 		unsigned long arg)
+ {
+ 	if (cmd == TIOCSPGRP)
+ 		return -ENOTTY;
+ 
+ 	return -EIO;
+ }
+ 
+ static long hung_up_tty_compat_ioctl(struct file *file,
+ 				     unsigned int cmd, unsigned long arg)
+ {
+ 	return cmd == TIOCSPGRP ? -ENOTTY : -EIO;
+ }
+ 
+ /* fasync() handler used by hung_up_tty_fops: always fails with -ENOTTY. */
+ static int hung_up_tty_fasync(int fd, struct file *file, int on)
+ {
+ 	return -ENOTTY;
+ }
+ 
+ /* Forward fdinfo output to the driver when it has a show_fdinfo() method. */
+ static void tty_show_fdinfo(struct seq_file *m, struct file *file)
+ {
+ 	struct tty_struct *tty = file_tty(file);
+ 
+ 	if (!tty || !tty->ops || !tty->ops->show_fdinfo)
+ 		return;
+ 
+ 	tty->ops->show_fdinfo(tty, m);
+ }
+ 
+ /* File operations table wiring the tty_* handlers; seeking is refused. */
+ static const struct file_operations tty_fops = {
+ 	.llseek		= no_llseek,
+ 	.read		= tty_read,
+ 	.write		= tty_write,
+ 	.poll		= tty_poll,
+ 	.unlocked_ioctl	= tty_ioctl,
+ 	.compat_ioctl	= tty_compat_ioctl,
+ 	.open		= tty_open,
+ 	.release	= tty_release,
+ 	.fasync		= tty_fasync,
+ 	.show_fdinfo	= tty_show_fdinfo,
+ };
+ 
+ /*
+  * Same handlers as tty_fops except that writes go through
+  * redirected_tty_write() and no show_fdinfo method is provided.
+  */
+ static const struct file_operations console_fops = {
+ 	.llseek		= no_llseek,
+ 	.read		= tty_read,
+ 	.write		= redirected_tty_write,
+ 	.poll		= tty_poll,
+ 	.unlocked_ioctl	= tty_ioctl,
+ 	.compat_ioctl	= tty_compat_ioctl,
+ 	.open		= tty_open,
+ 	.release	= tty_release,
+ 	.fasync		= tty_fasync,
+ };
+ 
+ /*
+  * Operations swapped onto open files by __tty_hangup(). Everything but
+  * release is replaced by a hung_up_tty_* stub; release stays tty_release.
+  * tty_hung_up_p() recognises a hung up file by this table's address.
+  */
+ static const struct file_operations hung_up_tty_fops = {
+ 	.llseek		= no_llseek,
+ 	.read		= hung_up_tty_read,
+ 	.write		= hung_up_tty_write,
+ 	.poll		= hung_up_tty_poll,
+ 	.unlocked_ioctl	= hung_up_tty_ioctl,
+ 	.compat_ioctl	= hung_up_tty_compat_ioctl,
+ 	.release	= tty_release,
+ 	.fasync		= hung_up_tty_fasync,
+ };
+ 
+ /*
+  * File that redirected_tty_write() diverts writes to, or NULL for none.
+  * Read and changed only with redirect_lock held.
+  */
+ static DEFINE_SPINLOCK(redirect_lock);
+ static struct file *redirect;
+ 
+ extern void tty_sysctl_init(void);
+ 
+ /**
+  *	tty_wakeup	-	request more data
+  *	@tty: terminal
+  *
+  *	Internal and external helper for wakeups of tty. This function
+  *	informs the line discipline if present that the driver is ready
+  *	to receive more output data.
+  */
+ 
+ void tty_wakeup(struct tty_struct *tty)
+ {
+ 	/* Only consult the ldisc when a write wakeup was requested. */
+ 	if (test_bit(TTY_DO_WRITE_WAKEUP, &tty->flags)) {
+ 		struct tty_ldisc *disc = tty_ldisc_ref(tty);
+ 
+ 		if (disc != NULL) {
+ 			if (disc->ops->write_wakeup != NULL)
+ 				disc->ops->write_wakeup(tty);
+ 			tty_ldisc_deref(disc);
+ 		}
+ 	}
+ 
+ 	/* Sleepers on write_wait are woken regardless of the flag. */
+ 	wake_up_interruptible_poll(&tty->write_wait, EPOLLOUT);
+ }
+ 
+ EXPORT_SYMBOL_GPL(tty_wakeup);
+ 
+ /**
+  *	__tty_hangup		-	actual handler for hangup events
+  *	@tty: tty device to hang up; NULL is accepted and ignored
+  *	@exit_session: passed through to tty_signal_session_leader()
+  *
+  *	This can be called by a "kworker" kernel thread.  That is process
+  *	synchronous but doesn't hold any locks, so we need to make sure we
+  *	have the appropriate locks for what we're doing.
+  *
+  *	The hangup event clears any pending redirections onto the hung up
+  *	device. It ensures future writes will error and it does the needed
+  *	line discipline hangup and signal delivery. The tty object itself
+  *	remains intact.
+  *
+  *	If the tty is already marked TTY_HUPPED only the redirection is
+  *	undone (and its file reference dropped); nothing else is repeated.
+  *
+  *	Locking:
+  *		BTM
+  *		  redirect lock for undoing redirection
+  *		  file list lock for manipulating list of ttys
+  *		  tty_ldiscs_lock from called functions
+  *		  termios_rwsem resetting termios data
+  *		  tasklist_lock to walk task list for hangup event
+  *		    ->siglock to protect ->signal/->sighand
+  */
+ static void __tty_hangup(struct tty_struct *tty, int exit_session)
+ {
+ 	struct file *cons_filp = NULL;
+ 	struct file *filp, *f = NULL;
+ 	struct tty_file_private *priv;
+ 	int    closecount = 0, n;
+ 	int refs;
+ 
+ 	if (!tty)
+ 		return;
+ 
+ 
+ 	/* Take over the redirect's file reference; it is dropped at out_fput. */
+ 	spin_lock(&redirect_lock);
+ 	if (redirect && file_tty(redirect) == tty) {
+ 		f = redirect;
+ 		redirect = NULL;
+ 	}
+ 	spin_unlock(&redirect_lock);
+ 
+ 	tty_lock(tty);
+ 
+ 	if (test_bit(TTY_HUPPED, &tty->flags)) {
+ 		tty_unlock(tty);
+ 		/*
+ 		 * The redirection was already detached above, so its file
+ 		 * reference must still be released or it would be leaked.
+ 		 */
+ 		goto out_fput;
+ 	}
+ 
+ 	/*
+ 	 * Some console devices aren't actually hung up for technical and
+ 	 * historical reasons, which can lead to indefinite interruptible
+ 	 * sleep in n_tty_read().  The following explicitly tells
+ 	 * n_tty_read() to abort readers.
+ 	 */
+ 	set_bit(TTY_HUPPING, &tty->flags);
+ 
+ 	/* inuse_filps is protected by the single tty lock,
+ 	   this really needs to change if we want to flush the
+ 	   workqueue with the lock held */
+ 	check_tty_count(tty, "tty_hangup");
+ 
+ 	spin_lock(&tty->files_lock);
+ 	/* This breaks for file handles being sent over AF_UNIX sockets ? */
+ 	list_for_each_entry(priv, &tty->tty_files, list) {
+ 		filp = priv->file;
+ 		/* Console opens are remembered but keep their operations. */
+ 		if (filp->f_op->write == redirected_tty_write)
+ 			cons_filp = filp;
+ 		if (filp->f_op->write != tty_write)
+ 			continue;
+ 		closecount++;
+ 		__tty_fasync(-1, filp, 0);	/* can't block */
+ 		filp->f_op = &hung_up_tty_fops;
+ 	}
+ 	spin_unlock(&tty->files_lock);
+ 
+ 	refs = tty_signal_session_leader(tty, exit_session);
+ 	/* Account for the p->signal references we killed */
+ 	while (refs--)
+ 		tty_kref_put(tty);
+ 
+ 	tty_ldisc_hangup(tty, cons_filp != NULL);
+ 
+ 	spin_lock_irq(&tty->ctrl_lock);
+ 	clear_bit(TTY_THROTTLED, &tty->flags);
+ 	clear_bit(TTY_DO_WRITE_WAKEUP, &tty->flags);
+ 	put_pid(tty->session);
+ 	put_pid(tty->pgrp);
+ 	tty->session = NULL;
+ 	tty->pgrp = NULL;
+ 	tty->ctrl_status = 0;
+ 	spin_unlock_irq(&tty->ctrl_lock);
+ 
+ 	/*
+ 	 * If one of the devices matches a console pointer, we
+ 	 * cannot just call hangup() because that will cause
+ 	 * tty->count and state->count to go out of sync.
+ 	 * So we just call close() the right number of times.
+ 	 */
+ 	if (cons_filp) {
+ 		if (tty->ops->close)
+ 			for (n = 0; n < closecount; n++)
+ 				tty->ops->close(tty, cons_filp);
+ 	} else if (tty->ops->hangup)
+ 		tty->ops->hangup(tty);
+ 	/*
+ 	 * We don't want to have driver/ldisc interactions beyond the ones
+ 	 * we did here. The driver layer expects no calls after ->hangup()
+ 	 * from the ldisc side, which is now guaranteed.
+ 	 */
+ 	set_bit(TTY_HUPPED, &tty->flags);
+ 	clear_bit(TTY_HUPPING, &tty->flags);
+ 	tty_unlock(tty);
+ 
+ out_fput:
+ 	if (f)
+ 		fput(f);
+ }
+ 
+ /* Work handler for tty->hangup_work: hang up the tty that embeds @work. */
+ static void do_tty_hangup(struct work_struct *work)
+ {
+ 	__tty_hangup(container_of(work, struct tty_struct, hangup_work), 0);
+ }
+ 
+ /**
+  *	tty_hangup		-	trigger a hangup event
+  *	@tty: tty to hangup
+  *
+  *	A carrier loss (virtual or otherwise) has occurred on this line;
+  *	schedule a hangup sequence to run after this event.
+  */
+ 
+ void tty_hangup(struct tty_struct *tty)
+ {
+ 	tty_debug_hangup(tty, "hangup\n");
+ 	/* Deferred: the hangup itself runs later from tty->hangup_work. */
+ 	schedule_work(&tty->hangup_work);
+ }
+ 
+ EXPORT_SYMBOL(tty_hangup);
+ 
+ /**
+  *	tty_vhangup		-	process vhangup
+  *	@tty: tty to hangup
+  *
+  *	The user has asked via system call for the terminal to be hung up.
+  *	We do this synchronously so that when the syscall returns the process
+  *	is complete. That guarantee is necessary for security reasons.
+  */
+ 
+ void tty_vhangup(struct tty_struct *tty)
+ {
+ 	tty_debug_hangup(tty, "vhangup\n");
+ 	/* Runs the hangup in the caller's context, with exit_session == 0. */
+ 	__tty_hangup(tty, 0);
+ }
+ 
+ EXPORT_SYMBOL(tty_vhangup);
+ 
+ 
+ /**
+  *	tty_vhangup_self	-	process vhangup for own ctty
+  *
+  *	Perform a vhangup on the current controlling tty
+  */
+ 
+ void tty_vhangup_self(void)
+ {
+ 	struct tty_struct *ctty = get_current_tty();
+ 
+ 	/* Nothing to do without a controlling tty. */
+ 	if (!ctty)
+ 		return;
+ 
+ 	tty_vhangup(ctty);
+ 	/* Drop the reference obtained from get_current_tty(). */
+ 	tty_kref_put(ctty);
+ }
+ 
+ /**
+  *	tty_vhangup_session		-	hangup session leader exit
+  *	@tty: tty to hangup
+  *
+  *	The session leader is exiting and hanging up its controlling terminal.
+  *	Every process in the foreground process group is signalled SIGHUP.
+  *
+  *	We do this synchronously so that when the syscall returns the process
+  *	is complete. That guarantee is necessary for security reasons.
+  */
+ 
+ void tty_vhangup_session(struct tty_struct *tty)
+ {
+ 	tty_debug_hangup(tty, "session hangup\n");
+ 	/* Same path as tty_vhangup() but with exit_session set to 1. */
+ 	__tty_hangup(tty, 1);
+ }
+ 
+ /**
+  *	tty_hung_up_p		-	was tty hung up
+  *	@filp: file pointer of tty
+  *
+  *	Return true if the tty has been subject to a vhangup or a carrier
+  *	loss
+  */
+ 
+ int tty_hung_up_p(struct file *filp)
+ {
+ 	if (!filp)
+ 		return 0;
+ 
+ 	/* A hung up file is one whose operations were switched by hangup. */
+ 	return filp->f_op == &hung_up_tty_fops;
+ }
+ 
+ EXPORT_SYMBOL(tty_hung_up_p);
+ 
+ /**
+  *	stop_tty	-	propagate flow control
+  *	@tty: tty to stop
+  *
+  *	Perform flow control to the driver. May be called
+  *	on an already stopped device and will not re-call the driver
+  *	method.
+  *
+  *	This functionality is used by both the line disciplines for
+  *	halting incoming flow and by the driver. It may therefore be
+  *	called from any context, may be under the tty atomic_write_lock
+  *	but not always.
+  *
+  *	Locking:
+  *		flow_lock
+  */
+ 
+ void __stop_tty(struct tty_struct *tty)
+ {
+ 	/* An already stopped tty does not reach the driver a second time. */
+ 	if (!tty->stopped) {
+ 		tty->stopped = 1;
+ 		if (tty->ops->stop)
+ 			tty->ops->stop(tty);
+ 	}
+ }
+ 
+ /* Locked wrapper: run __stop_tty() with flow_lock held and IRQs saved. */
+ void stop_tty(struct tty_struct *tty)
+ {
+ 	unsigned long irqflags;
+ 
+ 	spin_lock_irqsave(&tty->flow_lock, irqflags);
+ 	__stop_tty(tty);
+ 	spin_unlock_irqrestore(&tty->flow_lock, irqflags);
+ }
+ EXPORT_SYMBOL(stop_tty);
+ 
+ /**
+  *	start_tty	-	propagate flow control
+  *	@tty: tty to start
+  *
+  *	Start a tty that has been stopped if at all possible. If this
+  *	tty was previously stopped and is now being started, the driver
+  *	start method is invoked and the line discipline woken.
+  *
+  *	Locking:
+  *		flow_lock
+  */
+ 
+ void __start_tty(struct tty_struct *tty)
+ {
+ 	/* Only a stopped tty that is not flow_stopped can be restarted. */
+ 	if (tty->stopped && !tty->flow_stopped) {
+ 		tty->stopped = 0;
+ 		if (tty->ops->start)
+ 			tty->ops->start(tty);
+ 		tty_wakeup(tty);
+ 	}
+ }
+ 
+ /* Locked wrapper: run __start_tty() with flow_lock held and IRQs saved. */
+ void start_tty(struct tty_struct *tty)
+ {
+ 	unsigned long irqflags;
+ 
+ 	spin_lock_irqsave(&tty->flow_lock, irqflags);
+ 	__start_tty(tty);
+ 	spin_unlock_irqrestore(&tty->flow_lock, irqflags);
+ }
+ EXPORT_SYMBOL(start_tty);
+ 
+ /* Refresh @time from the wall clock, but only at 8 second granularity. */
+ static void tty_update_time(struct timespec64 *time)
+ {
+ 	time64_t now = ktime_get_real_seconds();
+ 
+ 	/*
+ 	 * The stored value is replaced only when it differs from the current
+ 	 * time in something other than the lower three bits (i.e. every 8
+ 	 * seconds).  Anything finer could be construed as a security leak,
+ 	 * since it would let userspace learn the exact timing of the tty.
+ 	 */
+ 	if (((now ^ time->tv_sec) & ~7) == 0)
+ 		return;
+ 
+ 	time->tv_sec = now;
+ }
+ 
+ /**
+  *	tty_read	-	read method for tty device files
+  *	@file: pointer to tty file
+  *	@buf: user buffer
+  *	@count: size of user buffer
+  *	@ppos: unused
+  *
+  *	Perform the read system call function on this terminal device. Checks
+  *	for hung up devices before calling the line discipline method.
+  *
+  *	Locking:
+  *		Locks the line discipline internally while needed. Multiple
+  *	read calls may be outstanding in parallel.
+  */
+ 
+ static ssize_t tty_read(struct file *file, char __user *buf, size_t count,
+ 			loff_t *ppos)
+ {
+ 	struct tty_struct *tty = file_tty(file);
+ 	struct inode *inode = file_inode(file);
+ 	struct tty_ldisc *disc;
+ 	int nread;
+ 
+ 	if (tty_paranoia_check(tty, inode, "tty_read"))
+ 		return -EIO;
+ 	if (!tty || tty_io_error(tty))
+ 		return -EIO;
+ 
+ 	/*
+ 	 * Wait for the line discipline to settle; without one the read is
+ 	 * answered the same way a hung up tty would answer it.
+ 	 */
+ 	disc = tty_ldisc_ref_wait(tty);
+ 	if (!disc)
+ 		return hung_up_tty_read(file, buf, count, ppos);
+ 
+ 	/* A line discipline without a read method cannot service the call. */
+ 	nread = -EIO;
+ 	if (disc->ops->read)
+ 		nread = disc->ops->read(tty, file, buf, count);
+ 	tty_ldisc_deref(disc);
+ 
+ 	/* Only a read that returned data touches the access time. */
+ 	if (nread > 0)
+ 		tty_update_time(&inode->i_atime);
+ 
+ 	return nread;
+ }
+ 
+ /* Release atomic_write_lock and wake anyone sleeping on write_wait. */
+ static void tty_write_unlock(struct tty_struct *tty)
+ {
+ 	mutex_unlock(&tty->atomic_write_lock);
+ 	wake_up_interruptible_poll(&tty->write_wait, EPOLLOUT);
+ }
+ 
+ /*
+  * Take atomic_write_lock. Returns 0 with the lock held, -EAGAIN when it
+  * is contended and @ndelay is set, or -ERESTARTSYS when the sleeping
+  * acquisition is interrupted.
+  */
+ static int tty_write_lock(struct tty_struct *tty, int ndelay)
+ {
+ 	/* Fast path: uncontended */
+ 	if (mutex_trylock(&tty->atomic_write_lock))
+ 		return 0;
+ 
+ 	if (ndelay)
+ 		return -EAGAIN;
+ 
+ 	if (mutex_lock_interruptible(&tty->atomic_write_lock))
+ 		return -ERESTARTSYS;
+ 
+ 	return 0;
+ }
+ 
+ /*
+  * Split writes up in sane blocksizes to avoid
+  * denial-of-service type attacks
+  */
+ static inline ssize_t do_tty_write(
+ 	ssize_t (*write)(struct tty_struct *, struct file *, const unsigned char *, size_t),
+ 	struct tty_struct *tty,
+ 	struct file *file,
+ 	const char __user *buf,
+ 	size_t count)
+ {
+ 	ssize_t ret, written = 0;
+ 	unsigned int chunk;
+ 
+ 	/* O_NDELAY callers get -EAGAIN instead of sleeping on the lock. */
+ 	ret = tty_write_lock(tty, file->f_flags & O_NDELAY);
+ 	if (ret < 0)
+ 		return ret;
+ 
+ 	/*
+ 	 * We chunk up writes into a temporary buffer. This
+ 	 * simplifies low-level drivers immensely, since they
+ 	 * don't have locking issues and user mode accesses.
+ 	 *
+ 	 * But if TTY_NO_WRITE_SPLIT is set, we should use a
+ 	 * big chunk-size..
+ 	 *
+ 	 * The default chunk-size is 2kB, because the NTTY
+ 	 * layer has problems with bigger chunks. It will
+ 	 * claim to be able to handle more characters than
+ 	 * it actually does.
+ 	 *
+ 	 * FIXME: This can probably go away now except that 64K chunks
+ 	 * are too likely to fail unless switched to vmalloc...
+ 	 */
+ 	chunk = 2048;
+ 	if (test_bit(TTY_NO_WRITE_SPLIT, &tty->flags))
+ 		chunk = 65536;
+ 	if (count < chunk)
+ 		chunk = count;
+ 
+ 	/* write_buf/write_cnt is protected by the atomic_write_lock mutex */
+ 	if (tty->write_cnt < chunk) {
+ 		unsigned char *buf_chunk;
+ 
+ 		/* Never allocate a bounce buffer smaller than 1kB. */
+ 		if (chunk < 1024)
+ 			chunk = 1024;
+ 
+ 		buf_chunk = kmalloc(chunk, GFP_KERNEL);
+ 		if (!buf_chunk) {
+ 			ret = -ENOMEM;
+ 			goto out;
+ 		}
+ 		/* The old buffer is only dropped once the new one exists. */
+ 		kfree(tty->write_buf);
+ 		tty->write_cnt = chunk;
+ 		tty->write_buf = buf_chunk;
+ 	}
+ 
+ 	/* Do the write .. */
+ 	for (;;) {
+ 		size_t size = count;
+ 		if (size > chunk)
+ 			size = chunk;
+ 		ret = -EFAULT;
+ 		if (copy_from_user(tty->write_buf, buf, size))
+ 			break;
+ 		ret = write(tty, file, tty->write_buf, size);
+ 		if (ret <= 0)
+ 			break;
+ 		/* Advance by what was actually accepted, which may be < size. */
+ 		written += ret;
+ 		buf += ret;
+ 		count -= ret;
+ 		if (!count)
+ 			break;
+ 		ret = -ERESTARTSYS;
+ 		if (signal_pending(current))
+ 			break;
+ 		cond_resched();
+ 	}
+ 	/* Partial progress takes precedence over a later error code. */
+ 	if (written) {
+ 		tty_update_time(&file_inode(file)->i_mtime);
+ 		ret = written;
+ 	}
+ out:
+ 	tty_write_unlock(tty);
+ 	return ret;
+ }
+ 
+ /**
+  * tty_write_message - write a message to a certain tty, not just the console.
+  * @tty: the destination tty_struct
+  * @msg: the message to write
+  *
+  * This is used for messages that need to be redirected to a specific tty.
+  * We don't put it into the syslog queue right now maybe in the future if
+  * really needed.
+  *
+  * We must still hold the BTM and test the CLOSING flag for the moment.
+  */
+ 
+ void tty_write_message(struct tty_struct *tty, char *msg)
+ {
+ 	/* A NULL tty is silently ignored. */
+ 	if (!tty)
+ 		return;
+ 
+ 	mutex_lock(&tty->atomic_write_lock);
+ 	tty_lock(tty);
+ 	/* Only write when the driver can and the tty still has users. */
+ 	if (tty->ops->write && tty->count > 0)
+ 		tty->ops->write(tty, msg, strlen(msg));
+ 	tty_unlock(tty);
+ 	tty_write_unlock(tty);
+ }
+ 
+ 
+ /**
+  *	tty_write		-	write method for tty device file
+  *	@file: tty file pointer
+  *	@buf: user data to write
+  *	@count: bytes to write
+  *	@ppos: unused
+  *
+  *	Write data to a tty device via the line discipline.
+  *
+  *	Locking:
+  *		Locks the line discipline as required
+  *		Writes to the tty driver are serialized by the atomic_write_lock
+  *	and are then processed in chunks to the device. The line discipline
+  *	write method will not be invoked in parallel for each device.
+  */
+ 
+ static ssize_t tty_write(struct file *file, const char __user *buf,
+ 						size_t count, loff_t *ppos)
+ {
+ 	struct tty_struct *tty = file_tty(file);
+ 	struct tty_ldisc *disc;
+ 	ssize_t ret;
+ 
+ 	if (tty_paranoia_check(tty, file_inode(file), "tty_write"))
+ 		return -EIO;
+ 	if (!tty || !tty->ops->write || tty_io_error(tty))
+ 		return -EIO;
+ 
+ 	/* Short term debug to catch buggy drivers */
+ 	if (tty->ops->write_room == NULL)
+ 		tty_err(tty, "missing write_room method\n");
+ 
+ 	/* Without a line discipline the write is answered as if hung up. */
+ 	disc = tty_ldisc_ref_wait(tty);
+ 	if (!disc)
+ 		return hung_up_tty_write(file, buf, count, ppos);
+ 
+ 	ret = -EIO;
+ 	if (disc->ops->write)
+ 		ret = do_tty_write(disc->ops->write, tty, file, buf, count);
+ 	tty_ldisc_deref(disc);
+ 
+ 	return ret;
+ }
+ 
+ /*
+  * Write handler used by console_fops: send the data to the redirect
+  * target when one is set, otherwise fall back to tty_write().
+  */
+ ssize_t redirected_tty_write(struct file *file, const char __user *buf,
+ 						size_t count, loff_t *ppos)
+ {
+ 	struct file *target = NULL;
+ 	ssize_t res;
+ 
+ 	/* Pin the target so it stays valid once the lock is dropped. */
+ 	spin_lock(&redirect_lock);
+ 	if (redirect)
+ 		target = get_file(redirect);
+ 	spin_unlock(&redirect_lock);
+ 
+ 	if (!target)
+ 		return tty_write(file, buf, count, ppos);
+ 
+ 	res = vfs_write(target, buf, count, &target->f_pos);
+ 	fput(target);
+ 	return res;
+ }
+ 
+ /**
+  *	tty_send_xchar	-	send priority character
+  *	@tty: the tty to send to
+  *	@ch: the character to send
+  *
+  *	Send a high priority character to the tty even if stopped
+  *
+  *	Returns 0 on success, or -ERESTARTSYS when the write lock could not
+  *	be taken on the fallback path.
+  *
+  *	Locking: none for xchar method, write ordering for write method.
+  */
+ 
+ int tty_send_xchar(struct tty_struct *tty, char ch)
+ {
+ 	int	was_stopped = tty->stopped;
+ 
+ 	/* Preferred path: the driver has its own send_xchar method. */
+ 	if (tty->ops->send_xchar) {
+ 		down_read(&tty->termios_rwsem);
+ 		tty->ops->send_xchar(tty, ch);
+ 		up_read(&tty->termios_rwsem);
+ 		return 0;
+ 	}
+ 
+ 	if (tty_write_lock(tty, 0) < 0)
+ 		return -ERESTARTSYS;
+ 
+ 	/* Fallback: briefly restart a stopped tty around a one byte write. */
+ 	down_read(&tty->termios_rwsem);
+ 	if (was_stopped)
+ 		start_tty(tty);
+ 	tty->ops->write(tty, &ch, 1);
+ 	if (was_stopped)
+ 		stop_tty(tty);
+ 	up_read(&tty->termios_rwsem);
+ 	tty_write_unlock(tty);
+ 	return 0;
+ }
+ 
+ /* Letters that pty_line_name() selects from, indexed by bits 4-7 of the minor. */
+ static char ptychar[] = "pqrstuvwxyzabcde";
+ 
+ /**
+  *	pty_line_name	-	generate name for a pty
+  *	@driver: the tty driver in use
+  *	@index: the minor number
+  *	@p: output buffer of at least 6 bytes
+  *
+  *	Generate a name from a driver reference and write it to the output
+  *	buffer.
+  *
+  *	Locking: None
+  */
+ static void pty_line_name(struct tty_driver *driver, int index, char *p)
+ {
+ 	int minor = index + driver->name_base;
+ 	const char *prefix;
+ 
+ 	/* ->name is initialized to "ttyp", but "tty" is expected */
+ 	if (driver->subtype == PTY_TYPE_SLAVE)
+ 		prefix = "tty";
+ 	else
+ 		prefix = driver->name;
+ 
+ 	/* Bits 4-7 pick the letter, bits 0-3 the trailing hex digit. */
+ 	sprintf(p, "%s%c%x", prefix, ptychar[(minor >> 4) & 0xf], minor & 0xf);
+ }
+ 
+ /**
+  *	tty_line_name	-	generate name for a tty
+  *	@driver: the tty driver in use
+  *	@index: the minor number
+  *	@p: output buffer of at least 7 bytes
+  *
+  *	Generate a name from a driver reference and write it to the output
+  *	buffer.
+  *
+  *	Locking: None
+  */
+ static ssize_t tty_line_name(struct tty_driver *driver, int index, char *p)
+ {
+ 	/* Numbered nodes (the common case) append the line number. */
+ 	if (!(driver->flags & TTY_DRIVER_UNNUMBERED_NODE))
+ 		return sprintf(p, "%s%d", driver->name,
+ 			       index + driver->name_base);
+ 
+ 	return sprintf(p, "%s", driver->name);
+ }
+ 
+ /**
+  *	tty_driver_lookup_tty() - find an existing tty, if any
+  *	@driver: the driver for the tty
+  *	@file:	 file handed to the driver lookup() method; may be NULL
+  *	@idx:	 the minor number
+  *
+  *	Return the tty, if found. If not found, return NULL or ERR_PTR() if the
+  *	driver lookup() method returns an error. ERR_PTR(-EIO) is returned when
+  *	the driver has a lookup() method but no @file was supplied, and
+  *	ERR_PTR(-EINVAL) when @idx does not fit the driver's ttys[] table.
+  *
+  *	Locking: tty_mutex must be held. If the tty is found, bump the tty kref.
+  */
+ static struct tty_struct *tty_driver_lookup_tty(struct tty_driver *driver,
+ 		struct file *file, int idx)
+ {
+ 	struct tty_struct *tty;
+ 
+ 	if (driver->ops->lookup) {
+ 		if (!file)
+ 			tty = ERR_PTR(-EIO);
+ 		else
+ 			tty = driver->ops->lookup(driver, file, idx);
+ 	} else {
+ 		/* Refuse an index that would read outside driver->ttys[]. */
+ 		if (idx < 0 || idx >= driver->num)
+ 			return ERR_PTR(-EINVAL);
+ 		tty = driver->ttys[idx];
+ 	}
+ 
+ 	/* Error pointers carry no reference; a NULL tty is passed through. */
+ 	if (!IS_ERR(tty))
+ 		tty_kref_get(tty);
+ 	return tty;
+ }
+ 
+ /**
+  *	tty_init_termios	-  helper for termios setup
+  *	@tty: the tty to set up
+  *
+  *	Initialise the termios structure for this tty. This runs under
+  *	the tty_mutex currently so we can be relaxed about ordering.
+  */
+ 
+ void tty_init_termios(struct tty_struct *tty)
+ {
+ 	struct ktermios *saved = NULL;
+ 
+ 	/* Lazily saved data is only consulted when the driver does not reset */
+ 	if (!(tty->driver->flags & TTY_DRIVER_RESET_TERMIOS))
+ 		saved = tty->driver->termios[tty->index];
+ 
+ 	if (saved != NULL) {
+ 		/* Restore the saved settings but take c_line from the driver */
+ 		tty->termios = *saved;
+ 		tty->termios.c_line  = tty->driver->init_termios.c_line;
+ 	} else {
+ 		tty->termios = tty->driver->init_termios;
+ 	}
+ 
+ 	/* Compatibility until drivers always set this */
+ 	tty->termios.c_ispeed = tty_termios_input_baud_rate(&tty->termios);
+ 	tty->termios.c_ospeed = tty_termios_baud_rate(&tty->termios);
+ }
+ EXPORT_SYMBOL_GPL(tty_init_termios);
+ 
+ /*
+  * Default install step, used by tty_driver_install_tty() when the driver
+  * has no install() method: set up termios, take a driver reference, count
+  * the first user and record the tty in the driver's table. Always returns 0.
+  */
+ int tty_standard_install(struct tty_driver *driver, struct tty_struct *tty)
+ {
+ 	tty_init_termios(tty);
+ 	tty_driver_kref_get(driver);
+ 	tty->count++;
+ 	driver->ttys[tty->index] = tty;
+ 	return 0;
+ }
+ EXPORT_SYMBOL_GPL(tty_standard_install);
+ 
+ /**
+  *	tty_driver_install_tty() - install a tty entry in the driver
+  *	@driver: the driver for the tty
+  *	@tty: the tty
+  *
+  *	Install a tty object into the driver tables. The tty->index field
+  *	will be set by the time this is called. This method is responsible
+  *	for ensuring any needed additional structures are allocated and
+  *	configured.
+  *
+  *	Locking: tty_mutex for now
+  */
+ static int tty_driver_install_tty(struct tty_driver *driver,
+ 						struct tty_struct *tty)
+ {
+ 	/* A driver specific install() overrides the standard one. */
+ 	if (driver->ops->install)
+ 		return driver->ops->install(driver, tty);
+ 
+ 	return tty_standard_install(driver, tty);
+ }
+ 
+ /**
+  *	tty_driver_remove_tty() - remove a tty from the driver tables
+  *	@driver: the driver for the tty
+  *	@tty: the tty to remove
+  *
+  *	Remove a tty object from the driver tables. The tty->index field
+  *	will be set by the time this is called.
+  *
+  *	Locking: tty_mutex for now
+  */
+ static void tty_driver_remove_tty(struct tty_driver *driver, struct tty_struct *tty)
+ {
+ 	/* Without a driver remove() method just clear the table slot. */
+ 	if (!driver->ops->remove) {
+ 		driver->ttys[tty->index] = NULL;
+ 		return;
+ 	}
+ 
+ 	driver->ops->remove(driver, tty);
+ }
+ 
+ /*
+  * 	tty_reopen()	- fast re-open of an open tty
+  * 	@tty	- the tty to open
+  *
+  *	Return 0 on success, -errno on error.
+  *	Re-opens on master ptys are not allowed and return -EIO.
+  *
+  *	Locking: Caller must hold tty_lock
+  */
+ static int tty_reopen(struct tty_struct *tty)
+ {
+ 	struct tty_driver *driver = tty->driver;
+ 	struct tty_ldisc *ld;
+ 	int retval = 0;
+ 
+ 	/* Master ptys can never be re-opened. */
+ 	if (driver->type == TTY_DRIVER_TYPE_PTY &&
+ 	    driver->subtype == PTY_TYPE_MASTER)
+ 		return -EIO;
+ 
+ 	/* No users left: report -EAGAIN rather than reviving this tty. */
+ 	if (!tty->count)
+ 		return -EAGAIN;
+ 
+ 	/* An exclusive tty may only be re-opened with CAP_SYS_ADMIN. */
+ 	if (test_bit(TTY_EXCLUSIVE, &tty->flags) && !capable(CAP_SYS_ADMIN))
+ 		return -EBUSY;
+ 
+ 	ld = tty_ldisc_ref_wait(tty);
+ 	if (ld) {
+ 		/* A line discipline is attached; the reference was only a probe. */
+ 		tty_ldisc_deref(ld);
+ 	} else {
+ 		/* No usable ldisc: take the ldisc lock (5 * HZ timeout) ... */
+ 		retval = tty_ldisc_lock(tty, 5 * HZ);
+ 		if (retval)
+ 			return retval;
+ 
+ 		/* ... and re-create it from termios if it is still missing. */
+ 		if (!tty->ldisc)
+ 			retval = tty_ldisc_reinit(tty, tty->termios.c_line);
+ 		tty_ldisc_unlock(tty);
+ 	}
+ 
+ 	/* The new opener is only counted when everything above succeeded. */
+ 	if (retval == 0)
+ 		tty->count++;
+ 
+ 	return retval;
+ }
+ 
+ /**
+  *	tty_init_dev		-	initialise a tty device
+  *	@driver: tty driver we are opening a device on
+  *	@idx: device index
+  *	@ret_tty: returned tty structure
+  *
+  *	Prepare a tty device. This may not be a "new" clean device but
+  *	could also be an active device. The pty drivers require special
+  *	handling because of this.
+  *
+  *	Locking:
+  *		The function is called under the tty_mutex, which
+  *	protects us from the tty struct or driver itself going away.
+  *
+  *	On exit the tty device has the line discipline attached and
+  *	a reference count of 1. If a pair was created for pty/tty use
+  *	and the other was a pty master then it too has a reference count of 1.
+  *
+  * WSH 06/09/97: Rewritten to remove races and properly clean up after a
+  * failed open.  The new code protects the open with a mutex, so it's
+  * really quite straightforward.  The mutex locking can probably be
+  * relaxed for the (most common) case of reopening a tty.
+  */
+ 
+ struct tty_struct *tty_init_dev(struct tty_driver *driver, int idx)
+ {
+ 	struct tty_struct *tty;
+ 	int retval;
+ 
+ 	/*
+ 	 * First time open is complex, especially for PTY devices.
+ 	 * This code guarantees that either everything succeeds and the
+ 	 * TTY is ready for operation, or else the table slots are vacated
+ 	 * and the allocated memory released.  (Except that the termios
+ 	 * may be retained.)
+ 	 */
+ 
+ 	/* Pin the driver module; dropped again at err_module_put. */
+ 	if (!try_module_get(driver->owner))
+ 		return ERR_PTR(-ENODEV);
+ 
+ 	tty = alloc_tty_struct(driver, idx);
+ 	if (!tty) {
+ 		retval = -ENOMEM;
+ 		goto err_module_put;
+ 	}
+ 
+ 	tty_lock(tty);
+ 	retval = tty_driver_install_tty(driver, tty);
+ 	if (retval < 0)
+ 		goto err_free_tty;
+ 
+ 	/* Fall back to the driver's port table if install left it unset. */
+ 	if (!tty->port)
+ 		tty->port = driver->ports[idx];
+ 
+ 	WARN_RATELIMIT(!tty->port,
+ 			"%s: %s driver does not set tty->port. This will crash the kernel later. Fix the driver!\n",
+ 			__func__, tty->driver->name);
+ 
+ 	retval = tty_ldisc_lock(tty, 5 * HZ);
+ 	if (retval)
+ 		goto err_release_lock;
+ 	/*
+ 	 * NOTE(review): tty->port is dereferenced here even when the warning
+ 	 * above fired because it is NULL. Bailing out instead would need
+ 	 * release_tty() to cope with a NULL port - confirm before changing.
+ 	 */
+ 	tty->port->itty = tty;
+ 
+ 	/*
+ 	 * Structures all installed ... call the ldisc open routines.
+ 	 * If we fail here just call release_tty to clean up.  No need
+ 	 * to decrement the use counts, as release_tty doesn't care.
+ 	 */
+ 	retval = tty_ldisc_setup(tty, tty->link);
+ 	if (retval)
+ 		goto err_release_tty;
+ 	tty_ldisc_unlock(tty);
+ 	/* Return the tty locked so that it cannot vanish under the caller */
+ 	return tty;
+ 
+ 	/* Install failed: nothing was published, so free the tty directly. */
+ err_free_tty:
+ 	tty_unlock(tty);
+ 	free_tty_struct(tty);
+ err_module_put:
+ 	module_put(driver->owner);
+ 	return ERR_PTR(retval);
+ 
+ 	/* call the tty release_tty routine to clean out this slot */
+ err_release_tty:
+ 	tty_ldisc_unlock(tty);
+ 	tty_info_ratelimited(tty, "ldisc open failed (%d), clearing slot %d\n",
+ 			     retval, idx);
+ err_release_lock:
+ 	tty_unlock(tty);
+ 	release_tty(tty, idx);
+ 	return ERR_PTR(retval);
+ }
+ 
+ /**
+  * tty_save_termios() - save tty termios data in driver table
+  * @tty: tty whose termios data to save
+  *
+  * Locking: Caller guarantees serialisation with tty_init_termios().
+  */
+ void tty_save_termios(struct tty_struct *tty)
+ {
+ 	struct tty_driver *driver = tty->driver;
+ 	struct ktermios *slot;
+ 
+ 	/* If the port is going to reset then it has no termios to save */
+ 	if (driver->flags & TTY_DRIVER_RESET_TERMIOS)
+ 		return;
+ 
+ 	/* Allocate the per-line slot on first use; give up quietly on OOM */
+ 	slot = driver->termios[tty->index];
+ 	if (!slot) {
+ 		slot = kmalloc(sizeof(*slot), GFP_KERNEL);
+ 		if (!slot)
+ 			return;
+ 		driver->termios[tty->index] = slot;
+ 	}
+ 
+ 	/* Stash the termios data */
+ 	*slot = tty->termios;
+ }
+ EXPORT_SYMBOL_GPL(tty_save_termios);
+ 
+ /**
+  *	tty_flush_works		-	flush all works of a tty/pty pair
+  *	@tty: tty device to flush works for (or either end of a pty pair)
+  *
+  *	Sync flush all works belonging to @tty (and the 'other' tty).
+  */
+ static void tty_flush_works(struct tty_struct *tty)
+ {
+ 	/* Wait for this tty's SAK and hangup work items to finish. */
+ 	flush_work(&tty->SAK_work);
+ 	flush_work(&tty->hangup_work);
+ 	/* Do the same for the linked tty, when there is one. */
+ 	if (tty->link) {
+ 		flush_work(&tty->link->SAK_work);
+ 		flush_work(&tty->link->hangup_work);
+ 	}
+ }
+ 
+ /**
+  *	release_one_tty		-	release tty structure memory
+  *	@work: work item embedded in the tty we are obliterating
+  *
+  *	Releases memory associated with a tty structure, and clears out the
+  *	driver table slots. This function is called when a device is no longer
+  *	in use. It also gets called when setup of a device fails.
+  *
+  *	Locking:
+  *		takes the file list lock internally when working on the list
+  *	of ttys that the driver keeps.
+  *
+  *	This method gets called from a work queue so that the driver private
+  *	cleanup ops can sleep (needed for USB at least)
+  */
+ static void release_one_tty(struct work_struct *work)
+ {
+ 	struct tty_struct *tty =
+ 		container_of(work, struct tty_struct, hangup_work);
+ 	struct tty_driver *driver = tty->driver;
+ 	struct module *owner = driver->owner;
+ 
+ 	if (tty->ops->cleanup)
+ 		tty->ops->cleanup(tty);
+ 
+ 	tty->magic = 0;
+ 	tty_driver_kref_put(driver);
+ 	module_put(owner);
+ 
+ 	spin_lock(&tty->files_lock);
+ 	list_del_init(&tty->tty_files);
+ 	spin_unlock(&tty->files_lock);
+ 
+ 	put_pid(tty->pgrp);
+ 	put_pid(tty->session);
+ 	free_tty_struct(tty);
+ }
+ 
+ static void queue_release_one_tty(struct kref *kref)
+ {
+ 	struct tty_struct *tty = container_of(kref, struct tty_struct, kref);
+ 
+ 	/* The hangup queue is now free so we can reuse it rather than
+ 	   waste a chunk of memory for each port */
+ 	INIT_WORK(&tty->hangup_work, release_one_tty);
+ 	schedule_work(&tty->hangup_work);
+ }
+ 
+ /**
+  *	tty_kref_put		-	release a tty kref
+  *	@tty: tty device
+  *
+  *	Release a reference to a tty device and if need be let the kref
+  *	layer destruct the object for us
+  */
+ 
+ void tty_kref_put(struct tty_struct *tty)
+ {
+ 	if (tty)
+ 		kref_put(&tty->kref, queue_release_one_tty);
+ }
+ EXPORT_SYMBOL(tty_kref_put);
+ 
+ /**
+  *	release_tty		-	release tty structure memory
+  *
+  *	Release both @tty and a possible linked partner (think pty pair),
+  *	and decrement the refcount of the backing module.
+  *
+  *	Locking:
+  *		tty_mutex
+  *		takes the file list lock internally when working on the list
+  *	of ttys that the driver keeps.
+  *
+  */
+ static void release_tty(struct tty_struct *tty, int idx)
+ {
+ 	/* This should always be true but check for the moment */
+ 	WARN_ON(tty->index != idx);
+ 	WARN_ON(!mutex_is_locked(&tty_mutex));
+ 	if (tty->ops->shutdown)
+ 		tty->ops->shutdown(tty);
+ 	tty_save_termios(tty);
+ 	tty_driver_remove_tty(tty->driver, tty);
+ 	tty->port->itty = NULL;
+ 	if (tty->link)
+ 		tty->link->port->itty = NULL;
+ 	tty_buffer_cancel_work(tty->port);
+ 	if (tty->link)
+ 		tty_buffer_cancel_work(tty->link->port);
+ 
+ 	tty_kref_put(tty->link);
+ 	tty_kref_put(tty);
+ }
+ 
+ /**
+  *	tty_release_checks - check a tty before real release
+  *	@tty: tty to check
+  *		(the linked tty, if any, is taken from @tty->link)
+  *	@idx: index of the tty
+  *
+  *	Performs some paranoid checking before true release of the @tty.
+  *	This is a no-op unless TTY_PARANOIA_CHECK is defined.
+  */
+ static int tty_release_checks(struct tty_struct *tty, int idx)
+ {
+ #ifdef TTY_PARANOIA_CHECK
+ 	if (idx < 0 || idx >= tty->driver->num) {
+ 		tty_debug(tty, "bad idx %d\n", idx);
+ 		return -1;
+ 	}
+ 
+ 	/* not much to check for devpts */
+ 	if (tty->driver->flags & TTY_DRIVER_DEVPTS_MEM)
+ 		return 0;
+ 
+ 	if (tty != tty->driver->ttys[idx]) {
+ 		tty_debug(tty, "bad driver table[%d] = %p\n",
+ 			  idx, tty->driver->ttys[idx]);
+ 		return -1;
+ 	}
+ 	if (tty->driver->other) {
+ 		struct tty_struct *o_tty = tty->link;
+ 
+ 		if (o_tty != tty->driver->other->ttys[idx]) {
+ 			tty_debug(tty, "bad other table[%d] = %p\n",
+ 				  idx, tty->driver->other->ttys[idx]);
+ 			return -1;
+ 		}
+ 		if (o_tty->link != tty) {
+ 			tty_debug(tty, "bad link = %p\n", o_tty->link);
+ 			return -1;
+ 		}
+ 	}
+ #endif
+ 	return 0;
+ }
+ 
+ /**
+  *      tty_kclose      -       closes tty opened by tty_kopen
+  *      @tty: tty device
+  *
+  *      Performs the final steps to release and free a tty device. It is the
+  *      same as tty_release_struct except that it also resets TTY_PORT_KOPENED
+  *      flag on tty->port.
+  */
+ void tty_kclose(struct tty_struct *tty)
+ {
+ 	/*
+ 	 * Ask the line discipline code to release its structures
+ 	 */
+ 	tty_ldisc_release(tty);
+ 
+ 	/* Wait for pending work before tty destruction commences */
+ 	tty_flush_works(tty);
+ 
+ 	tty_debug_hangup(tty, "freeing structure\n");
+ 	/*
+ 	 * The release_tty function takes care of the details of clearing
+ 	 * the slots and preserving the termios structure. The tty_unlock_pair
+ 	 * should be safe as we keep a kref while the tty is locked (so the
+ 	 * unlock never unlocks a freed tty).
+ 	 */
+ 	mutex_lock(&tty_mutex);
+ 	tty_port_set_kopened(tty->port, 0);
+ 	release_tty(tty, tty->index);
+ 	mutex_unlock(&tty_mutex);
+ }
+ EXPORT_SYMBOL_GPL(tty_kclose);
+ 
+ /**
+  *	tty_release_struct	-	release a tty struct
+  *	@tty: tty device
+  *	@idx: index of the tty
+  *
+  *	Performs the final steps to release and free a tty device. It is
+  *	roughly the reverse of tty_init_dev.
+  */
+ void tty_release_struct(struct tty_struct *tty, int idx)
+ {
+ 	/*
+ 	 * Ask the line discipline code to release its structures
+ 	 */
+ 	tty_ldisc_release(tty);
+ 
+ 	/* Wait for pending work before tty destruction commences */
+ 	tty_flush_works(tty);
+ 
+ 	tty_debug_hangup(tty, "freeing structure\n");
+ 	/*
+ 	 * The release_tty function takes care of the details of clearing
+ 	 * the slots and preserving the termios structure. The tty_unlock_pair
+ 	 * should be safe as we keep a kref while the tty is locked (so the
+ 	 * unlock never unlocks a freed tty).
+ 	 */
+ 	mutex_lock(&tty_mutex);
+ 	release_tty(tty, idx);
+ 	mutex_unlock(&tty_mutex);
+ }
+ EXPORT_SYMBOL_GPL(tty_release_struct);
+ 
+ /**
+  *	tty_release		-	vfs callback for close
+  *	@inode: inode of tty
+  *	@filp: file pointer for handle to tty
+  *
+  *	Called the last time each file handle is closed that references
+  *	this tty. There may however be several such references.
+  *
+  *	Locking:
+  *		Takes tty_lock, plus the slave's tty lock for a pty master.
+  *
+  * Even releasing the tty structures is a tricky business.. We have
+  * to be very careful that the structures are all released at the
+  * same time, as interrupts might otherwise get the wrong pointers.
+  *
+  * WSH 09/09/97: rewritten to avoid some nasty race conditions that could
+  * lead to double frees or releasing memory still in use.
+  */
+ 
+ int tty_release(struct inode *inode, struct file *filp)
+ {
+ 	struct tty_struct *tty = file_tty(filp);
+ 	struct tty_struct *o_tty = NULL;
+ 	int	do_sleep, final;
+ 	int	idx;
+ 	long	timeout = 0;
+ 	int	once = 1;
+ 
+ 	if (tty_paranoia_check(tty, inode, __func__))
+ 		return 0;
+ 
+ 	tty_lock(tty);
+ 	check_tty_count(tty, __func__);
+ 
+ 	__tty_fasync(-1, filp, 0);
+ 
+ 	idx = tty->index;
+ 	if (tty->driver->type == TTY_DRIVER_TYPE_PTY &&
+ 	    tty->driver->subtype == PTY_TYPE_MASTER)
+ 		o_tty = tty->link;
+ 
+ 	if (tty_release_checks(tty, idx)) {
+ 		tty_unlock(tty);
+ 		return 0;
+ 	}
+ 
+ 	tty_debug_hangup(tty, "releasing (count=%d)\n", tty->count);
+ 
+ 	if (tty->ops->close)
+ 		tty->ops->close(tty, filp);
+ 
+ 	/* If tty is pty master, lock the slave pty (stable lock order) */
+ 	tty_lock_slave(o_tty);
+ 
+ 	/*
+ 	 * Sanity check: if tty->count is going to zero, there shouldn't be
+ 	 * any waiters on tty->read_wait or tty->write_wait.  We test the
+ 	 * wait queues and kick everyone out _before_ actually starting to
+ 	 * close.  This ensures that we won't block while releasing the tty
+ 	 * structure.
+ 	 *
+ 	 * The test for the o_tty closing is necessary, since the master and
+ 	 * slave sides may close in any order.  If the slave side closes out
+ 	 * first, its count will be one, since the master side holds an open.
+ 	 * Thus this test wouldn't be triggered at the time the slave closed,
+ 	 * so we do it now.
+ 	 */
+ 	while (1) {
+ 		do_sleep = 0;
+ 
+ 		if (tty->count <= 1) {
+ 			if (waitqueue_active(&tty->read_wait)) {
+ 				wake_up_poll(&tty->read_wait, EPOLLIN);
+ 				do_sleep++;
+ 			}
+ 			if (waitqueue_active(&tty->write_wait)) {
+ 				wake_up_poll(&tty->write_wait, EPOLLOUT);
+ 				do_sleep++;
+ 			}
+ 		}
+ 		if (o_tty && o_tty->count <= 1) {
+ 			if (waitqueue_active(&o_tty->read_wait)) {
+ 				wake_up_poll(&o_tty->read_wait, EPOLLIN);
+ 				do_sleep++;
+ 			}
+ 			if (waitqueue_active(&o_tty->write_wait)) {
+ 				wake_up_poll(&o_tty->write_wait, EPOLLOUT);
+ 				do_sleep++;
+ 			}
+ 		}
+ 		if (!do_sleep)
+ 			break;
+ 
+ 		if (once) {
+ 			once = 0;
+ 			tty_warn(tty, "read/write wait queue active!\n");
+ 		}
+ 		schedule_timeout_killable(timeout);
+ 		if (timeout < 120 * HZ)
+ 			timeout = 2 * timeout + 1;
+ 		else
+ 			timeout = MAX_SCHEDULE_TIMEOUT;
+ 	}
+ 
+ 	if (o_tty) {
+ 		if (--o_tty->count < 0) {
+ 			tty_warn(tty, "bad slave count (%d)\n", o_tty->count);
+ 			o_tty->count = 0;
+ 		}
+ 	}
+ 	if (--tty->count < 0) {
+ 		tty_warn(tty, "bad tty->count (%d)\n", tty->count);
+ 		tty->count = 0;
+ 	}
+ 
+ 	/*
+ 	 * We've decremented tty->count, so we need to remove this file
+ 	 * descriptor off the tty->tty_files list; this serves two
+ 	 * purposes:
+ 	 *  - check_tty_count sees the correct number of file descriptors
+ 	 *    associated with this tty.
+ 	 *  - do_tty_hangup no longer sees this file descriptor as
+ 	 *    something that needs to be handled for hangups.
+ 	 */
+ 	tty_del_file(filp);
+ 
+ 	/*
+ 	 * Perform some housekeeping before deciding whether to return.
+ 	 *
+ 	 * If _either_ side is closing, make sure there aren't any
+ 	 * processes that still think tty or o_tty is their controlling
+ 	 * tty.
+ 	 */
+ 	if (!tty->count) {
+ 		read_lock(&tasklist_lock);
+ 		session_clear_tty(tty->session);
+ 		if (o_tty)
+ 			session_clear_tty(o_tty->session);
+ 		read_unlock(&tasklist_lock);
+ 	}
+ 
+ 	/* check whether both sides are closing ... */
+ 	final = !tty->count && !(o_tty && o_tty->count);
+ 
+ 	tty_unlock_slave(o_tty);
+ 	tty_unlock(tty);
+ 
+ 	/* At this point, the tty->count == 0 should ensure a dead tty
+ 	   cannot be re-opened by a racing opener */
+ 
+ 	if (!final)
+ 		return 0;
+ 
+ 	tty_debug_hangup(tty, "final close\n");
+ 
+ 	tty_release_struct(tty, idx);
+ 	return 0;
+ }
+ 
+ /**
+  *	tty_open_current_tty - get locked tty of current task
+  *	@device: device number
+  *	@filp: file pointer to tty
+  *	@return: locked tty of the current task iff @device is /dev/tty
+  *
+  *	Performs a re-open of the current task's controlling tty.
+  *
+  *	We cannot return driver and index like for the other nodes because
+  *	devpts will not work then. It expects inodes to be from devpts FS.
+  */
+ static struct tty_struct *tty_open_current_tty(dev_t device, struct file *filp)
+ {
+ 	struct tty_struct *tty;
+ 	int retval;
+ 
+ 	if (device != MKDEV(TTYAUX_MAJOR, 0))
+ 		return NULL;
+ 
+ 	tty = get_current_tty();
+ 	if (!tty)
+ 		return ERR_PTR(-ENXIO);
+ 
+ 	filp->f_flags |= O_NONBLOCK; /* Don't let /dev/tty block */
+ 	/* noctty = 1; */
+ 	tty_lock(tty);
+ 	tty_kref_put(tty);	/* safe to drop the kref now */
+ 
+ 	retval = tty_reopen(tty);
+ 	if (retval < 0) {
+ 		tty_unlock(tty);
+ 		tty = ERR_PTR(retval);
+ 	}
+ 	return tty;
+ }
+ 
+ /**
+  *	tty_lookup_driver - lookup a tty driver for a given device file
+  *	@device: device number
+  *	@filp: file pointer to tty
+  *	@index: index for the device in the @return driver
+  *	@return: driver for this inode (with increased refcount)
+  *
+  * 	If @return is not erroneous, the caller is responsible to decrement the
+  * 	refcount by tty_driver_kref_put.
+  *
+  *	Locking: tty_mutex protects get_tty_driver
+  */
+ static struct tty_driver *tty_lookup_driver(dev_t device, struct file *filp,
+ 		int *index)
+ {
+ 	struct tty_driver *driver = NULL;
+ 
+ 	switch (device) {
+ #ifdef CONFIG_VT
+ 	case MKDEV(TTY_MAJOR, 0): {
+ 		extern struct tty_driver *console_driver;
+ 		driver = tty_driver_kref_get(console_driver);
+ 		*index = fg_console;
+ 		break;
+ 	}
+ #endif
+ 	case MKDEV(TTYAUX_MAJOR, 1): {
+ 		struct tty_driver *console_driver = console_device(index);
+ 		if (console_driver) {
+ 			driver = tty_driver_kref_get(console_driver);
+ 			if (driver && filp) {
+ 				/* Don't let /dev/console block */
+ 				filp->f_flags |= O_NONBLOCK;
+ 				break;
+ 			}
+ 		}
+ 		if (driver)
+ 			tty_driver_kref_put(driver);
+ 		return ERR_PTR(-ENODEV);
+ 	}
+ 	default:
+ 		driver = get_tty_driver(device, index);
+ 		if (!driver)
+ 			return ERR_PTR(-ENODEV);
+ 		break;
+ 	}
+ 	return driver;
+ }
+ 
+ /**
+  *	tty_kopen	-	open a tty device for kernel
+  *	@device: dev_t of device to open
+  *
+  *	Opens tty exclusively for kernel. Performs the driver lookup,
+  *	makes sure it's not already opened and performs the first-time
+  *	tty initialization.
+  *
+  *	Returns the locked initialized &tty_struct
+  *
+  *	Claims the global tty_mutex to serialize:
+  *	  - concurrent first-time tty initialization
+  *	  - concurrent tty driver removal w/ lookup
+  *	  - concurrent tty removal from driver table
+  */
+ struct tty_struct *tty_kopen(dev_t device)
+ {
+ 	struct tty_struct *tty;
+ 	struct tty_driver *driver = NULL;
+ 	int index = -1;
+ 
+ 	mutex_lock(&tty_mutex);
+ 	driver = tty_lookup_driver(device, NULL, &index);
+ 	if (IS_ERR(driver)) {
+ 		mutex_unlock(&tty_mutex);
+ 		return ERR_CAST(driver);
+ 	}
+ 
+ 	/* check whether we're reopening an existing tty */
+ 	tty = tty_driver_lookup_tty(driver, NULL, index);
+ 	if (IS_ERR(tty))
+ 		goto out;
+ 
+ 	if (tty) {
+ 		/* drop kref from tty_driver_lookup_tty() */
+ 		tty_kref_put(tty);
+ 		tty = ERR_PTR(-EBUSY);
+ 	} else { /* tty_init_dev returns tty with the tty_lock held */
+ 		tty = tty_init_dev(driver, index);
+ 		if (IS_ERR(tty))
+ 			goto out;
+ 		tty_port_set_kopened(tty->port, 1);
+ 	}
+ out:
+ 	mutex_unlock(&tty_mutex);
+ 	tty_driver_kref_put(driver);
+ 	return tty;
+ }
+ EXPORT_SYMBOL_GPL(tty_kopen);
+ 
+ /**
+  *	tty_open_by_driver	-	open a tty device
+  *	@device: dev_t of device to open
+  *	@inode: inode of device file
+  *	@filp: file pointer to tty
+  *
+  *	Performs the driver lookup, checks for a reopen, or otherwise
+  *	performs the first-time tty initialization.
+  *
+  *	Returns the locked initialized or re-opened &tty_struct
+  *
+  *	Claims the global tty_mutex to serialize:
+  *	  - concurrent first-time tty initialization
+  *	  - concurrent tty driver removal w/ lookup
+  *	  - concurrent tty removal from driver table
+  */
+ static struct tty_struct *tty_open_by_driver(dev_t device, struct inode *inode,
+ 					     struct file *filp)
+ {
+ 	struct tty_struct *tty;
+ 	struct tty_driver *driver = NULL;
+ 	int index = -1;
+ 	int retval;
+ 
+ 	mutex_lock(&tty_mutex);
+ 	driver = tty_lookup_driver(device, filp, &index);
+ 	if (IS_ERR(driver)) {
+ 		mutex_unlock(&tty_mutex);
+ 		return ERR_CAST(driver);
+ 	}
+ 
+ 	/* check whether we're reopening an existing tty */
+ 	tty = tty_driver_lookup_tty(driver, filp, index);
+ 	if (IS_ERR(tty)) {
+ 		mutex_unlock(&tty_mutex);
+ 		goto out;
+ 	}
+ 
+ 	if (tty) {
+ 		if (tty_port_kopened(tty->port)) {
+ 			tty_kref_put(tty);
+ 			mutex_unlock(&tty_mutex);
+ 			tty = ERR_PTR(-EBUSY);
+ 			goto out;
+ 		}
+ 		mutex_unlock(&tty_mutex);
+ 		retval = tty_lock_interruptible(tty);
+ 		tty_kref_put(tty);  /* drop kref from tty_driver_lookup_tty() */
+ 		if (retval) {
+ 			if (retval == -EINTR)
+ 				retval = -ERESTARTSYS;
+ 			tty = ERR_PTR(retval);
+ 			goto out;
+ 		}
+ 		retval = tty_reopen(tty);
+ 		if (retval < 0) {
+ 			tty_unlock(tty);
+ 			tty = ERR_PTR(retval);
+ 		}
+ 	} else { /* Returns with the tty_lock held for now */
+ 		tty = tty_init_dev(driver, index);
+ 		mutex_unlock(&tty_mutex);
+ 	}
+ out:
+ 	tty_driver_kref_put(driver);
+ 	return tty;
+ }
+ 
+ /**
+  *	tty_open		-	open a tty device
+  *	@inode: inode of device file
+  *	@filp: file pointer to tty
+  *
+  *	tty_open and tty_release keep up the tty count that contains the
+  *	number of opens done on a tty. We cannot use the inode-count, as
+  *	different inodes might point to the same tty.
+  *
+  *	Open-counting is needed for pty masters, as well as for keeping
+  *	track of serial lines: DTR is dropped when the last close happens.
+  *	(This is not done solely through tty->count, now.  - Ted 1/27/92)
+  *
+  *	The termios state of a pty is reset on first open so that
+  *	settings don't persist across reuse.
+  *
+  *	Locking: tty_mutex protects tty, tty_lookup_driver and tty_init_dev.
+  *		 tty->count should protect the rest.
+  *		 ->siglock protects ->signal/->sighand
+  *
+  *	Note: the tty_unlock/lock cases without a ref are only safe due to
+  *	tty_mutex
+  */
+ 
+ static int tty_open(struct inode *inode, struct file *filp)
+ {
+ 	struct tty_struct *tty;
+ 	int noctty, retval;
+ 	dev_t device = inode->i_rdev;
+ 	unsigned saved_flags = filp->f_flags;
+ 
+ 	nonseekable_open(inode, filp);
+ 
+ retry_open:
+ 	retval = tty_alloc_file(filp);
+ 	if (retval)
+ 		return -ENOMEM;
+ 
+ 	tty = tty_open_current_tty(device, filp);
+ 	if (!tty)
+ 		tty = tty_open_by_driver(device, inode, filp);
+ 
+ 	if (IS_ERR(tty)) {
+ 		tty_free_file(filp);
+ 		retval = PTR_ERR(tty);
+ 		if (retval != -EAGAIN || signal_pending(current))
+ 			return retval;
+ 		schedule();
+ 		goto retry_open;
+ 	}
+ 
+ 	tty_add_file(tty, filp);
+ 
+ 	check_tty_count(tty, __func__);
+ 	tty_debug_hangup(tty, "opening (count=%d)\n", tty->count);
+ 
+ 	if (tty->ops->open)
+ 		retval = tty->ops->open(tty, filp);
+ 	else
+ 		retval = -ENODEV;
+ 	filp->f_flags = saved_flags;
+ 
+ 	if (retval) {
+ 		tty_debug_hangup(tty, "open error %d, releasing\n", retval);
+ 
+ 		tty_unlock(tty); /* need to call tty_release without BTM */
+ 		tty_release(inode, filp);
+ 		if (retval != -ERESTARTSYS)
+ 			return retval;
+ 
+ 		if (signal_pending(current))
+ 			return retval;
+ 
+ 		schedule();
+ 		/*
+ 		 * Need to reset f_op in case a hangup happened.
+ 		 */
+ 		if (tty_hung_up_p(filp))
+ 			filp->f_op = &tty_fops;
+ 		goto retry_open;
+ 	}
+ 	clear_bit(TTY_HUPPED, &tty->flags);
+ 
+ 	noctty = (filp->f_flags & O_NOCTTY) ||
+ 		 (IS_ENABLED(CONFIG_VT) && device == MKDEV(TTY_MAJOR, 0)) ||
+ 		 device == MKDEV(TTYAUX_MAJOR, 1) ||
+ 		 (tty->driver->type == TTY_DRIVER_TYPE_PTY &&
+ 		  tty->driver->subtype == PTY_TYPE_MASTER);
+ 	if (!noctty)
+ 		tty_open_proc_set_tty(filp, tty);
+ 	tty_unlock(tty);
+ 	return 0;
+ }
+ 
+ 
+ 
+ /**
+  *	tty_poll	-	check tty status
+  *	@filp: file being polled
+  *	@wait: poll wait structures to update
+  *
+  *	Call the line discipline polling method to obtain the poll
+  *	status of the device.
+  *
+  *	Locking: locks called line discipline but ldisc poll method
+  *	may be re-entered freely by other callers.
+  */
+ 
+ static __poll_t tty_poll(struct file *filp, poll_table *wait)
+ {
+ 	struct tty_struct *tty = file_tty(filp);
+ 	struct tty_ldisc *ld;
+ 	__poll_t ret = 0;
+ 
+ 	if (tty_paranoia_check(tty, file_inode(filp), "tty_poll"))
+ 		return 0;
+ 
+ 	ld = tty_ldisc_ref_wait(tty);
+ 	if (!ld)
+ 		return hung_up_tty_poll(filp, wait);
+ 	if (ld->ops->poll)
+ 		ret = ld->ops->poll(tty, filp, wait);
+ 	tty_ldisc_deref(ld);
+ 	return ret;
+ }
+ 
+ static int __tty_fasync(int fd, struct file *filp, int on)
+ {
+ 	struct tty_struct *tty = file_tty(filp);
+ 	unsigned long flags;
+ 	int retval = 0;
+ 
+ 	if (tty_paranoia_check(tty, file_inode(filp), "tty_fasync"))
+ 		goto out;
+ 
+ 	retval = fasync_helper(fd, filp, on, &tty->fasync);
+ 	if (retval <= 0)
+ 		goto out;
+ 
+ 	if (on) {
+ 		enum pid_type type;
+ 		struct pid *pid;
+ 
+ 		spin_lock_irqsave(&tty->ctrl_lock, flags);
+ 		if (tty->pgrp) {
+ 			pid = tty->pgrp;
+ 			type = PIDTYPE_PGID;
+ 		} else {
+ 			pid = task_pid(current);
+ 			type = PIDTYPE_TGID;
+ 		}
+ 		get_pid(pid);
+ 		spin_unlock_irqrestore(&tty->ctrl_lock, flags);
+ 		__f_setown(filp, pid, type, 0);
+ 		put_pid(pid);
+ 		retval = 0;
+ 	}
+ out:
+ 	return retval;
+ }
+ 
+ static int tty_fasync(int fd, struct file *filp, int on)
+ {
+ 	struct tty_struct *tty = file_tty(filp);
+ 	int retval = -ENOTTY;
+ 
+ 	tty_lock(tty);
+ 	if (!tty_hung_up_p(filp))
+ 		retval = __tty_fasync(fd, filp, on);
+ 	tty_unlock(tty);
+ 
+ 	return retval;
+ }
+ 
+ /**
+  *	tiocsti			-	fake input character
+  *	@tty: tty to fake input into
+  *	@p: pointer to character
+  *
+  *	Fake input to a tty device. Does the necessary locking and
+  *	input management.
+  *
+  *	FIXME: does not honour flow control ??
+  *
+  *	Locking:
+  *		Called functions take tty_ldiscs_lock
+  *		current->signal->tty check is safe without locks
+  *
+  *	FIXME: may race normal receive processing
+  */
+ 
+ int tiocsti_restrict = IS_ENABLED(CONFIG_SECURITY_TIOCSTI_RESTRICT);
+ 
+ static int tiocsti(struct tty_struct *tty, char __user *p)
+ {
+ 	char ch, mbz = 0;
+ 	struct tty_ldisc *ld;
+ 
+ 	if (tiocsti_restrict &&
+ 		!ns_capable(tty->owner_user_ns, CAP_SYS_ADMIN)) {
+ 		dev_warn_ratelimited(tty->dev,
+ 			"Denied TIOCSTI ioctl for non-privileged process\n");
+ 		return -EPERM;
+ 	}
+ 	if (gr_handle_tiocsti(tty))
+ 		return -EPERM;
+ 	if ((current->signal->tty != tty) && !capable(CAP_SYS_ADMIN))
+ 		return -EPERM;
+ 	if (get_user(ch, p))
+ 		return -EFAULT;
+ 	tty_audit_tiocsti(tty, ch);
+ 	ld = tty_ldisc_ref_wait(tty);
+ 	if (!ld)
+ 		return -EIO;
+ 	if (ld->ops->receive_buf)
+ 		ld->ops->receive_buf(tty, &ch, &mbz, 1);
+ 	tty_ldisc_deref(ld);
+ 	return 0;
+ }
+ 
+ /**
+  *	tiocgwinsz		-	implement window query ioctl
+  *	@tty: tty
+  *	@arg: user buffer for result
+  *
+  *	Copies the kernel idea of the window size into the user buffer.
+  *
+  *	Locking: tty->winsize_mutex is taken to ensure the winsize data
+  *		is consistent.
+  */
+ 
+ static int tiocgwinsz(struct tty_struct *tty, struct winsize __user *arg)
+ {
+ 	int err;
+ 
+ 	mutex_lock(&tty->winsize_mutex);
+ 	err = copy_to_user(arg, &tty->winsize, sizeof(*arg));
+ 	mutex_unlock(&tty->winsize_mutex);
+ 
+ 	return err ? -EFAULT: 0;
+ }
+ 
+ /**
+  *	tty_do_resize		-	resize event
+  *	@tty: tty being resized
+  *	@ws: new window size; nothing is done when it equals the
+  *	     current one
+  *
+  *	Send SIGWINCH to the foreground process group and record the
+  *	new window size so that a terminal resize is performed correctly
+  */
+ 
+ int tty_do_resize(struct tty_struct *tty, struct winsize *ws)
+ {
+ 	struct pid *pgrp;
+ 
+ 	/* Lock the tty */
+ 	mutex_lock(&tty->winsize_mutex);
+ 	if (!memcmp(ws, &tty->winsize, sizeof(*ws)))
+ 		goto done;
+ 
+ 	/* Signal the foreground process group */
+ 	pgrp = tty_get_pgrp(tty);
+ 	if (pgrp)
+ 		kill_pgrp(pgrp, SIGWINCH, 1);
+ 	put_pid(pgrp);
+ 
+ 	tty->winsize = *ws;
+ done:
+ 	mutex_unlock(&tty->winsize_mutex);
+ 	return 0;
+ }
+ EXPORT_SYMBOL(tty_do_resize);
+ 
+ /**
+  *	tiocswinsz		-	implement window size set ioctl
+  *	@tty: tty side of tty
+  *	@arg: user buffer holding the new window size
+  *
+  *	Copies the user idea of the window size to the kernel. Traditionally
+  *	this is just advisory information but for the Linux console it
+  *	actually has driver level meaning and triggers a VC resize.
+  *
+  *	Locking:
+  *		Driver dependent. The default do_resize method takes the
+  *	tty winsize mutex and ctrl_lock. The console takes its own lock
+  *	then calls into the default method.
+  */
+ 
+ static int tiocswinsz(struct tty_struct *tty, struct winsize __user *arg)
+ {
+ 	struct winsize tmp_ws;
+ 	if (copy_from_user(&tmp_ws, arg, sizeof(*arg)))
+ 		return -EFAULT;
+ 
+ 	if (tty->ops->resize)
+ 		return tty->ops->resize(tty, &tmp_ws);
+ 	else
+ 		return tty_do_resize(tty, &tmp_ws);
+ }
+ 
+ /**
+  *	tioccons	-	allow admin to move logical console
+  *	@file: the file to become console
+  *
+  *	Allow the administrator to move the redirected console device
+  *
+  *	Locking: uses redirect_lock to guard the redirect information
+  */
+ 
+ static int tioccons(struct file *file)
+ {
+ 	if (!capable(CAP_SYS_ADMIN))
+ 		return -EPERM;
+ 	if (file->f_op->write == redirected_tty_write) {
+ 		struct file *f;
+ 		spin_lock(&redirect_lock);
+ 		f = redirect;
+ 		redirect = NULL;
+ 		spin_unlock(&redirect_lock);
+ 		if (f)
+ 			fput(f);
+ 		return 0;
+ 	}
+ 	spin_lock(&redirect_lock);
+ 	if (redirect) {
+ 		spin_unlock(&redirect_lock);
+ 		return -EBUSY;
+ 	}
+ 	redirect = get_file(file);
+ 	spin_unlock(&redirect_lock);
+ 	return 0;
+ }
+ 
+ /**
+  *	tiocsetd	-	set line discipline
+  *	@tty: tty device
+  *	@p: pointer to user data
+  *
+  *	Set the line discipline according to user request.
+  *
+  *	Locking: see tty_set_ldisc, this function is just a helper
+  */
+ 
+ static int tiocsetd(struct tty_struct *tty, int __user *p)
+ {
+ 	int disc;
+ 	int ret;
+ 
+ 	if (get_user(disc, p))
+ 		return -EFAULT;
+ 
+ 	ret = tty_set_ldisc(tty, disc);
+ 
+ 	return ret;
+ }
+ 
+ /**
+  *	tiocgetd	-	get line discipline
+  *	@tty: tty device
+  *	@p: pointer to user data
+  *
+  *	Retrieves the line discipline id directly from the ldisc.
+  *
+  *	Locking: waits for ldisc reference (in case the line discipline
+  *		is changing or the tty is being hungup)
+  */
+ 
+ static int tiocgetd(struct tty_struct *tty, int __user *p)
+ {
+ 	struct tty_ldisc *ld;
+ 	int ret;
+ 
+ 	ld = tty_ldisc_ref_wait(tty);
+ 	if (!ld)
+ 		return -EIO;
+ 	ret = put_user(ld->ops->num, p);
+ 	tty_ldisc_deref(ld);
+ 	return ret;
+ }
+ 
+ /**
+  *	send_break	-	perform timed break
+  *	@tty: device to break on
+  *	@duration: timeout in ms
+  *
+  *	Perform a timed break on hardware that lacks its own driver level
+  *	timed break functionality.
+  *
+  *	Locking:
+  *		atomic_write_lock serializes
+  *
+  */
+ 
+ static int send_break(struct tty_struct *tty, unsigned int duration)
+ {
+ 	int retval;
+ 
+ 	if (tty->ops->break_ctl == NULL)
+ 		return 0;
+ 
+ 	if (tty->driver->flags & TTY_DRIVER_HARDWARE_BREAK)
+ 		retval = tty->ops->break_ctl(tty, duration);
+ 	else {
+ 		/* Do the work ourselves */
+ 		if (tty_write_lock(tty, 0) < 0)
+ 			return -EINTR;
+ 		retval = tty->ops->break_ctl(tty, -1);
+ 		if (retval)
+ 			goto out;
+ 		if (!signal_pending(current))
+ 			msleep_interruptible(duration);
+ 		retval = tty->ops->break_ctl(tty, 0);
+ out:
+ 		tty_write_unlock(tty);
+ 		if (signal_pending(current))
+ 			retval = -EINTR;
+ 	}
+ 	return retval;
+ }
+ 
+ /**
+  *	tty_tiocmget		-	get modem status
+  *	@tty: tty device
+  *	@p: user pointer receiving the status bits (written only when the
+  *	    driver call succeeds)
+  *
+  *	Obtain the modem status bits from the tty driver if the feature
+  *	is supported. Return -EINVAL if it is not available.
+  *
+  *	Locking: none (up to the driver)
+  */
+ 
+ static int tty_tiocmget(struct tty_struct *tty, int __user *p)
+ {
+ 	int retval = -EINVAL;
+ 
+ 	if (tty->ops->tiocmget) {
+ 		retval = tty->ops->tiocmget(tty);
+ 
+ 		if (retval >= 0)
+ 			retval = put_user(retval, p);
+ 	}
+ 	return retval;
+ }
+ 
+ /**
+  *	tty_tiocmset		-	set modem status
+  *	@tty: tty device
+  *	@cmd: command - clear bits, set bits or set all
+  *	@p: pointer to desired bits
+  *
+  *	Set the modem status bits from the tty driver if the feature
+  *	is supported. Return -EINVAL if it is not available.
+  *
+  *	Locking: none (up to the driver)
+  */
+ 
+ static int tty_tiocmset(struct tty_struct *tty, unsigned int cmd,
+ 	     unsigned __user *p)
+ {
+ 	int retval;
+ 	unsigned int set, clear, val;
+ 
+ 	if (tty->ops->tiocmset == NULL)
+ 		return -EINVAL;
+ 
+ 	retval = get_user(val, p);
+ 	if (retval)
+ 		return retval;
+ 	set = clear = 0;
+ 	switch (cmd) {
+ 	case TIOCMBIS:
+ 		set = val;
+ 		break;
+ 	case TIOCMBIC:
+ 		clear = val;
+ 		break;
+ 	case TIOCMSET:
+ 		set = val;
+ 		clear = ~val;
+ 		break;
+ 	}
+ 	set &= TIOCM_DTR|TIOCM_RTS|TIOCM_OUT1|TIOCM_OUT2|TIOCM_LOOP;
+ 	clear &= TIOCM_DTR|TIOCM_RTS|TIOCM_OUT1|TIOCM_OUT2|TIOCM_LOOP;
+ 	return tty->ops->tiocmset(tty, set, clear);
+ }
+ 
+ static int tty_tiocgicount(struct tty_struct *tty, void __user *arg)
+ {
+ 	int retval = -EINVAL;
+ 	struct serial_icounter_struct icount;
+ 	memset(&icount, 0, sizeof(icount));
+ 	if (tty->ops->get_icount)
+ 		retval = tty->ops->get_icount(tty, &icount);
+ 	if (retval != 0)
+ 		return retval;
+ 	if (copy_to_user(arg, &icount, sizeof(icount)))
+ 		return -EFAULT;
+ 	return 0;
+ }
+ 
+ static int tty_tiocsserial(struct tty_struct *tty, struct serial_struct __user *ss)
+ {
+ 	static DEFINE_RATELIMIT_STATE(depr_flags,
+ 			DEFAULT_RATELIMIT_INTERVAL,
+ 			DEFAULT_RATELIMIT_BURST);
+ 	char comm[TASK_COMM_LEN];
+ 	struct serial_struct v;
+ 	int flags;
+ 
+ 	if (copy_from_user(&v, ss, sizeof(struct serial_struct)))
+ 		return -EFAULT;
+ 
+ 	flags = v.flags & ASYNC_DEPRECATED;
+ 
+ 	if (flags && __ratelimit(&depr_flags))
+ 		pr_warn("%s: '%s' is using deprecated serial flags (with no effect): %.8x\n",
+ 			__func__, get_task_comm(comm, current), flags);
+ 	if (!tty->ops->set_serial)
+ 		return -ENOTTY;
+ 	return tty->ops->set_serial(tty, &v);
+ }
+ 
+ static int tty_tiocgserial(struct tty_struct *tty, struct serial_struct __user *ss)
+ {
+ 	struct serial_struct v;
+ 	int err;
+ 
+ 	memset(&v, 0, sizeof(struct serial_struct));
+ 	if (!tty->ops->get_serial)
+ 		return -ENOTTY;
+ 	err = tty->ops->get_serial(tty, &v);
+ 	if (!err && copy_to_user(ss, &v, sizeof(struct serial_struct)))
+ 		err = -EFAULT;
+ 	return err;
+ }
+ 
+ /*
+  * if pty, return the slave side (real_tty)
+  * otherwise, return self
+  */
+ static struct tty_struct *tty_pair_get_tty(struct tty_struct *tty)
+ {
+ 	if (tty->driver->type == TTY_DRIVER_TYPE_PTY &&
+ 	    tty->driver->subtype == PTY_TYPE_MASTER)
+ 		tty = tty->link;
+ 	return tty;
+ }
+ 
+ /*
+  * Split this up, as gcc can choke on it otherwise..
+  */
+ long tty_ioctl(struct file *file, unsigned int cmd, unsigned long arg)
+ {
+ 	struct tty_struct *tty = file_tty(file);
+ 	struct tty_struct *real_tty;
+ 	void __user *p = (void __user *)arg;
+ 	int retval;
+ 	struct tty_ldisc *ld;
+ 
+ 	if (tty_paranoia_check(tty, file_inode(file), "tty_ioctl"))
+ 		return -EINVAL;
+ 
+ 	real_tty = tty_pair_get_tty(tty);
+ 
+ 	/*
+ 	 * Factor out some common prep work
+ 	 */
+ 	switch (cmd) {
+ 	case TIOCSETD:
+ 	case TIOCSBRK:
+ 	case TIOCCBRK:
+ 	case TCSBRK:
+ 	case TCSBRKP:
+ 		retval = tty_check_change(tty);
+ 		if (retval)
+ 			return retval;
+ 		if (cmd != TIOCCBRK) {
+ 			tty_wait_until_sent(tty, 0);
+ 			if (signal_pending(current))
+ 				return -EINTR;
+ 		}
+ 		break;
+ 	}
+ 
+ 	/*
+ 	 *	Now do the stuff.
+ 	 */
+ 	switch (cmd) {
+ 	case TIOCSTI:
+ 		return tiocsti(tty, p);
+ 	case TIOCGWINSZ:
+ 		return tiocgwinsz(real_tty, p);
+ 	case TIOCSWINSZ:
+ 		return tiocswinsz(real_tty, p);
+ 	case TIOCCONS:
+ 		return real_tty != tty ? -EINVAL : tioccons(file);
+ 	case TIOCEXCL:
+ 		set_bit(TTY_EXCLUSIVE, &tty->flags);
+ 		return 0;
+ 	case TIOCNXCL:
+ 		clear_bit(TTY_EXCLUSIVE, &tty->flags);
+ 		return 0;
+ 	case TIOCGEXCL:
+ 	{
+ 		int excl = test_bit(TTY_EXCLUSIVE, &tty->flags);
+ 		return put_user(excl, (int __user *)p);
+ 	}
+ 	case TIOCGETD:
+ 		return tiocgetd(tty, p);
+ 	case TIOCSETD:
+ 		return tiocsetd(tty, p);
+ 	case TIOCVHANGUP:
+ 		if (!capable(CAP_SYS_ADMIN))
+ 			return -EPERM;
+ 		tty_vhangup(tty);
+ 		return 0;
+ 	case TIOCGDEV:
+ 	{
+ 		unsigned int ret = new_encode_dev(tty_devnum(real_tty));
+ 		return put_user(ret, (unsigned int __user *)p);
+ 	}
+ 	/*
+ 	 * Break handling
+ 	 */
+ 	case TIOCSBRK:	/* Turn break on, unconditionally */
+ 		if (tty->ops->break_ctl)
+ 			return tty->ops->break_ctl(tty, -1);
+ 		return 0;
+ 	case TIOCCBRK:	/* Turn break off, unconditionally */
+ 		if (tty->ops->break_ctl)
+ 			return tty->ops->break_ctl(tty, 0);
+ 		return 0;
+ 	case TCSBRK:   /* SVID version: non-zero arg --> no break */
+ 		/* non-zero arg means wait for all output data
+ 		 * to be sent (performed above) but don't send break.
+ 		 * This is used by the tcdrain() termios function.
+ 		 */
+ 		if (!arg)
+ 			return send_break(tty, 250);
+ 		return 0;
+ 	case TCSBRKP:	/* support for POSIX tcsendbreak() */
+ 		return send_break(tty, arg ? arg*100 : 250);
+ 
+ 	case TIOCMGET:
+ 		return tty_tiocmget(tty, p);
+ 	case TIOCMSET:
+ 	case TIOCMBIC:
+ 	case TIOCMBIS:
+ 		return tty_tiocmset(tty, cmd, p);
+ 	case TIOCGICOUNT:
+ 		return tty_tiocgicount(tty, p);
+ 	case TCFLSH:
+ 		switch (arg) {
+ 		case TCIFLUSH:
+ 		case TCIOFLUSH:
+ 		/* flush tty buffer and allow ldisc to process ioctl */
+ 			tty_buffer_flush(tty, NULL);
+ 			break;
+ 		}
+ 		break;
+ 	case TIOCSSERIAL:
+ 		return tty_tiocsserial(tty, p);
+ 	case TIOCGSERIAL:
+ 		return tty_tiocgserial(tty, p);
+ 	case TIOCGPTPEER:
+ 		/* Special because the struct file is needed */
+ 		return ptm_open_peer(file, tty, (int)arg);
+ 	default:
+ 		retval = tty_jobctrl_ioctl(tty, real_tty, file, cmd, arg);
+ 		if (retval != -ENOIOCTLCMD)
+ 			return retval;
+ 	}
+ 	if (tty->ops->ioctl) {
+ 		retval = tty->ops->ioctl(tty, cmd, arg);
+ 		if (retval != -ENOIOCTLCMD)
+ 			return retval;
+ 	}
+ 	ld = tty_ldisc_ref_wait(tty);
+ 	if (!ld)
+ 		return hung_up_tty_ioctl(file, cmd, arg);
+ 	retval = -EINVAL;
+ 	if (ld->ops->ioctl) {
+ 		retval = ld->ops->ioctl(tty, file, cmd, arg);
+ 		if (retval == -ENOIOCTLCMD)
+ 			retval = -ENOTTY;
+ 	}
+ 	tty_ldisc_deref(ld);
+ 	return retval;
+ }
+ 
+ #ifdef CONFIG_COMPAT
+ 
+ struct serial_struct32 {
+         compat_int_t    type;
+         compat_int_t    line;
+         compat_uint_t   port;
+         compat_int_t    irq;
+         compat_int_t    flags;
+         compat_int_t    xmit_fifo_size;
+         compat_int_t    custom_divisor;
+         compat_int_t    baud_base;
+         unsigned short  close_delay;
+         char    io_type;
+         char    reserved_char[1];
+         compat_int_t    hub6;
+         unsigned short  closing_wait; /* time to wait before closing */
+         unsigned short  closing_wait2; /* no longer used... */
+         compat_uint_t   iomem_base;
+         unsigned short  iomem_reg_shift;
+         unsigned int    port_high;
+      /* compat_ulong_t  iomap_base FIXME */
+         compat_int_t    reserved[1];
+ };
+ 
+ static int compat_tty_tiocsserial(struct tty_struct *tty,
+ 		struct serial_struct32 __user *ss)
+ {
+ 	static DEFINE_RATELIMIT_STATE(depr_flags,
+ 			DEFAULT_RATELIMIT_INTERVAL,
+ 			DEFAULT_RATELIMIT_BURST);
+ 	char comm[TASK_COMM_LEN];
+ 	struct serial_struct32 v32;
+ 	struct serial_struct v;
+ 	int flags;
+ 
+ 	if (copy_from_user(&v32, ss, sizeof(struct serial_struct32)))
+ 		return -EFAULT;
+ 
+ 	memcpy(&v, &v32, offsetof(struct serial_struct32, iomem_base));
+ 	v.iomem_base = compat_ptr(v32.iomem_base);
+ 	v.iomem_reg_shift = v32.iomem_reg_shift;
+ 	v.port_high = v32.port_high;
+ 	v.iomap_base = 0;
+ 
+ 	flags = v.flags & ASYNC_DEPRECATED;
+ 
+ 	if (flags && __ratelimit(&depr_flags))
+ 		pr_warn("%s: '%s' is using deprecated serial flags (with no effect): %.8x\n",
+ 			__func__, get_task_comm(comm, current), flags);
+ 	if (!tty->ops->set_serial)
+ 		return -ENOTTY;
+ 	return tty->ops->set_serial(tty, &v);
+ }
+ 
+ /* Compat TIOCGSERIAL; an iomem_base wider than 32 bits reads back as ~0. */
+ static int compat_tty_tiocgserial(struct tty_struct *tty,
+ 			struct serial_struct32 __user *ss)
+ {
+ 	struct serial_struct32 v32;
+ 	struct serial_struct v;
+ 	int err;
+ 
+ 	memset(&v, 0, sizeof(v));
+ 	memset(&v32, 0, sizeof(v32));
+ 	if (!tty->ops->get_serial)
+ 		return -ENOTTY;
+ 	err = tty->ops->get_serial(tty, &v);
+ 	if (!err) {
+ 		memcpy(&v32, &v, offsetof(struct serial_struct32, iomem_base));
+ 		v32.iomem_base = (unsigned long)v.iomem_base >> 32 ?
+ 			0xffffffff : ptr_to_compat(v.iomem_base);
+ 		v32.iomem_reg_shift = v.iomem_reg_shift;
+ 		v32.port_high = v.port_high;
+ 		if (copy_to_user(ss, &v32, sizeof(struct serial_struct32)))
+ 			err = -EFAULT;
+ 	}
+ 	return err;
+ }
+ /* Compat (32-bit ABI) ioctl entry point for tty files. */
+ static long tty_compat_ioctl(struct file *file, unsigned int cmd,
+ 				unsigned long arg)
+ {
+ 	struct tty_struct *tty = file_tty(file);
+ 	struct tty_ldisc *ld;
+ 	int retval = -ENOIOCTLCMD;
+ 
+ 	switch (cmd) {	/* commands tty_ioctl() can service as they are */
+ 	case TIOCSTI:
+ 	case TIOCGWINSZ:
+ 	case TIOCSWINSZ:
+ 	case TIOCGEXCL:
+ 	case TIOCGETD:
+ 	case TIOCSETD:
+ 	case TIOCGDEV:
+ 	case TIOCMGET:
+ 	case TIOCMSET:
+ 	case TIOCMBIC:
+ 	case TIOCMBIS:
+ 	case TIOCGICOUNT:
+ 	case TIOCGPGRP:
+ 	case TIOCSPGRP:
+ 	case TIOCGSID:
+ 	case TIOCSERGETLSR:
+ 	case TIOCGRS485:
+ 	case TIOCSRS485:
+ #ifdef TIOCGETP
+ 	case TIOCGETP:
+ 	case TIOCSETP:
+ 	case TIOCSETN:
+ #endif
+ #ifdef TIOCGETC
+ 	case TIOCGETC:
+ 	case TIOCSETC:
+ #endif
+ #ifdef TIOCGLTC
+ 	case TIOCGLTC:
+ 	case TIOCSLTC:
+ #endif
+ 	case TCSETSF:
+ 	case TCSETSW:
+ 	case TCSETS:
+ 	case TCGETS:
+ #ifdef TCGETS2
+ 	case TCGETS2:
+ 	case TCSETSF2:
+ 	case TCSETSW2:
+ 	case TCSETS2:
+ #endif
+ 	case TCGETA:
+ 	case TCSETAF:
+ 	case TCSETAW:
+ 	case TCSETA:
+ 	case TIOCGLCKTRMIOS:
+ 	case TIOCSLCKTRMIOS:
+ #ifdef TCGETX
+ 	case TCGETX:
+ 	case TCSETX:
+ 	case TCSETXW:
+ 	case TCSETXF:
+ #endif
+ 	case TIOCGSOFTCAR:
+ 	case TIOCSSOFTCAR:
+ 		return tty_ioctl(file, cmd, (unsigned long)compat_ptr(arg));
+ 	case TIOCCONS:
+ 	case TIOCEXCL:
+ 	case TIOCNXCL:
+ 	case TIOCVHANGUP:
+ 	case TIOCSBRK:
+ 	case TIOCCBRK:
+ 	case TCSBRK:
+ 	case TCSBRKP:
+ 	case TCFLSH:
+ 	case TIOCGPTPEER:
+ 	case TIOCNOTTY:
+ 	case TIOCSCTTY:
+ 	case TCXONC:
+ 	case TIOCMIWAIT:
+ 	case TIOCSERCONFIG:
+ 		return tty_ioctl(file, cmd, arg);
+ 	}
+ 
+ 	if (tty_paranoia_check(tty, file_inode(file), "tty_ioctl"))
+ 		return -EINVAL;
+ 	switch (cmd) {	/* struct serial_struct32 has to be repacked */
+ 	case TIOCSSERIAL:
+ 		return compat_tty_tiocsserial(tty, compat_ptr(arg));
+ 	case TIOCGSERIAL:
+ 		return compat_tty_tiocgserial(tty, compat_ptr(arg));
+ 	}
+ 	if (tty->ops->compat_ioctl) {
+ 		retval = tty->ops->compat_ioctl(tty, cmd, arg);
+ 		if (retval != -ENOIOCTLCMD)
+ 			return retval;
+ 	}
+ 	/* Line discipline last; its native ->ioctl() is handed the unchanged
+ 	 * cmd and the argument converted with compat_ptr(). */
+ 	ld = tty_ldisc_ref_wait(tty);
+ 	if (!ld)
+ 		return hung_up_tty_compat_ioctl(file, cmd, arg);
+ 	if (ld->ops->compat_ioctl)
+ 		retval = ld->ops->compat_ioctl(tty, file, cmd, arg);
+ 	if (retval == -ENOIOCTLCMD && ld->ops->ioctl)
+ 		retval = ld->ops->ioctl(tty, file, cmd,
+ 				(unsigned long)compat_ptr(arg));
+ 	tty_ldisc_deref(ld);
+ 	return retval;
+ }
+ #endif
+ 
+ /* iterate_fd() callback: fd + 1 when @file is a tty file on @t, else 0 */
+ static int this_tty(const void *t, struct file *file, unsigned fd)
+ {
+ 	bool hit = unlikely(file->f_op->read == tty_read) && file_tty(file) == t;
+ 	return hit ? fd + 1 : 0;
+ }
+ 	
+ /*
+  * This implements the "Secure Attention Key" ---  the idea is to
+  * prevent trojan horses by killing all processes associated with this
+  * tty when the user hits the "Secure Attention Key".  Required for
+  * super-paranoid applications --- see the Orange Book for more details.
+  *
+  * This code could be nicer; ideally it should send a HUP, wait a few
+  * seconds, then send an INT, and then a KILL signal.  But you then
+  * have to coordinate with the init process, since all processes associated
+  * with the current tty must be dead before the new getty is allowed
+  * to spawn.
+  *
+  * If only it were correct ;-/ The current code has a nasty hole -
+  * it doesn't catch files in flight: a process may send the descriptor to
+  * itself via an AF_UNIX socket, close it and fetch it back later. FIXME.
+  *
+  * Nasty bug: do_SAK is being called in interrupt context.  This can
+  * deadlock.  We punt it up to process context.  AKPM - 16Mar2001
+  */
+ void __do_SAK(struct tty_struct *tty)
+ {
+ #ifdef TTY_SOFT_SAK
+ 	tty_hangup(tty);
+ #else
+ 	struct task_struct *g, *p;
+ 	struct pid *session;
+ 	int		i;
+ 	unsigned long flags;
+ 
+ 	if (!tty)
+ 		return;
+ 
+ 	spin_lock_irqsave(&tty->ctrl_lock, flags);
+ 	session = get_pid(tty->session);
+ 	spin_unlock_irqrestore(&tty->ctrl_lock, flags);
+ 
+ 	tty_ldisc_flush(tty);
+ 
+ 	tty_driver_flush_buffer(tty);
+ 
+ 	read_lock(&tasklist_lock);
+ 	/* Kill the entire session */
+ 	do_each_pid_task(session, PIDTYPE_SID, p) {
+ 		tty_notice(tty, "SAK: killed process %d (%s): by session\n",
+ 			   task_pid_nr(p), p->comm);
+ 		group_send_sig_info(SIGKILL, SEND_SIG_PRIV, p, PIDTYPE_SID);
+ 	} while_each_pid_task(session, PIDTYPE_SID, p);
+ 
+ 	/* Now kill any processes that happen to have the tty open */
+ 	do_each_thread(g, p) {
+ 		if (p->signal->tty == tty) {
+ 			tty_notice(tty, "SAK: killed process %d (%s): by controlling tty\n",
+ 				   task_pid_nr(p), p->comm);
+ 			group_send_sig_info(SIGKILL, SEND_SIG_PRIV, p, PIDTYPE_SID);
+ 			continue;
+ 		}
+ 		task_lock(p);
+ 		i = iterate_fd(p->files, 0, this_tty, tty);
+ 		if (i != 0) {
+ 			tty_notice(tty, "SAK: killed process %d (%s): by fd#%d\n",
+ 				   task_pid_nr(p), p->comm, i - 1);
+ 			group_send_sig_info(SIGKILL, SEND_SIG_PRIV, p, PIDTYPE_SID);
+ 		}
+ 		task_unlock(p);
+ 	} while_each_thread(g, p);
+ 	read_unlock(&tasklist_lock);
+ 	put_pid(session);
+ #endif
+ }
+ 
+ /* Work handler for tty->SAK_work: map @work back to the tty that
+  * embeds it and run __do_SAK() on that tty. */
+ static void do_SAK_work(struct work_struct *work)
+ {
+ 	__do_SAK(container_of(work, struct tty_struct, SAK_work));
+ }
+ 
+ /*
+  * The tq handling here is a little racy - tty->SAK_work may already be queued.
+  * Fortunately we don't need to worry, because if ->SAK_work is already queued,
+  * the values which we write to it will be identical to the values which it
+  * already has. --akpm
+  */
+ void do_SAK(struct tty_struct *tty)
+ {
+ 	if (!tty)
+ 		return;
+ 	schedule_work(&tty->SAK_work);
+ }
+ 
+ EXPORT_SYMBOL(do_SAK);
+ 
+ /* Must put_device() after it's unused! */
+ static struct device *tty_get_device(struct tty_struct *tty)
+ {
+ 	dev_t devt = tty_devnum(tty);
+ 	return class_find_device_by_devt(tty_class, devt);
+ }
+ 
+ 
+ /**
+  *	alloc_tty_struct - allocate and initialize a tty structure
+  *	@driver: driver the tty is bound to (also supplies tty->ops)
+  *	@idx: line index, stored in tty->index and passed to tty_line_name()
+  *
+  *	Returns the new tty, or NULL if kzalloc() or tty_ldisc_init() fails.
+  *	Locking: none - tty in question is not exposed at this point
+  */
+ struct tty_struct *alloc_tty_struct(struct tty_driver *driver, int idx)
+ {
+ 	struct tty_struct *tty;
+ 
+ 	tty = kzalloc(sizeof(*tty), GFP_KERNEL);
+ 	if (!tty)
+ 		return NULL;
+ 
+ 	kref_init(&tty->kref);
+ 	tty->magic = TTY_MAGIC;
+ 	if (tty_ldisc_init(tty)) {
+ 		kfree(tty);
+ 		return NULL;
+ 	}
+ 	tty->session = NULL;
+ 	tty->pgrp = NULL;
+ 	mutex_init(&tty->legacy_mutex);
+ 	mutex_init(&tty->throttle_mutex);
+ 	init_rwsem(&tty->termios_rwsem);
+ 	mutex_init(&tty->winsize_mutex);
+ 	init_ldsem(&tty->ldisc_sem);
+ 	init_waitqueue_head(&tty->write_wait);
+ 	init_waitqueue_head(&tty->read_wait);
+ 	INIT_WORK(&tty->hangup_work, do_tty_hangup);
+ 	mutex_init(&tty->atomic_write_lock);
+ 	spin_lock_init(&tty->ctrl_lock);
+ 	spin_lock_init(&tty->flow_lock);
+ 	spin_lock_init(&tty->files_lock);
+ 	INIT_LIST_HEAD(&tty->tty_files);
+ 	INIT_WORK(&tty->SAK_work, do_SAK_work);
+ 
+ 	tty->driver = driver;
+ 	tty->ops = driver->ops;
+ 	tty->index = idx;
+ 	tty_line_name(driver, idx, tty->name);
+ 	tty->dev = tty_get_device(tty);
+ 	tty->owner_user_ns = get_user_ns(current_user_ns());
+ 
+ 	return tty;
+ }
+ 
+ /**
+  *	tty_put_char	-	write one character to a tty
+  *	@tty: tty
+  *	@ch: character
+  *
+  *	Write one byte to the tty using the provided put_char method
+  *	if present. Returns the number of characters successfully output.
+  *
+  *	Note: the specific put_char operation in the driver layer may go
+  *	away soon. Don't call it directly, use this method
+  */
+ 
+ int tty_put_char(struct tty_struct *tty, unsigned char ch)
+ {
+ 	if (tty->ops->put_char)
+ 		return tty->ops->put_char(tty, ch);
+ 	return tty->ops->write(tty, &ch, 1);
+ }
+ EXPORT_SYMBOL_GPL(tty_put_char);
+ 
+ struct class *tty_class;
+ 
+ static int tty_cdev_add(struct tty_driver *driver, dev_t dev,
+ 		unsigned int index, unsigned int count)
+ {
+ 	int err;
+ 
+ 	/* init here, since reused cdevs cause crashes */
+ 	driver->cdevs[index] = cdev_alloc();
+ 	if (!driver->cdevs[index])
+ 		return -ENOMEM;
+ 	driver->cdevs[index]->ops = &tty_fops;
+ 	driver->cdevs[index]->owner = driver->owner;
+ 	err = cdev_add(driver->cdevs[index], dev, count);
+ 	if (err)
+ 		kobject_put(&driver->cdevs[index]->kobj);
+ 	return err;
+ }
+ 
+ /**
+  *	tty_register_device - register a tty device
+  *	@driver: the tty driver that describes the tty device
+  *	@index: the index in the tty driver for this tty device
+  *	@device: a struct device that is associated with this tty device.
+  *		This field is optional, if there is no known struct device
+  *		for this tty device it can be set to NULL safely.
+  *
+  *	Returns a pointer to the struct device for this tty device
+  *	(or ERR_PTR(-EFOO) on error).
+  *
+  *	This call is required to be made to register an individual tty device
+  *	if the tty driver's flags have the TTY_DRIVER_DYNAMIC_DEV bit set.  If
+  *	that bit is not set, this function should not be called by a tty
+  *	driver.
+  *
+  *	Locking: ??
+  */
+ 
+ struct device *tty_register_device(struct tty_driver *driver, unsigned index,
+ 				   struct device *device)
+ {
+ 	return tty_register_device_attr(driver, index, device, NULL, NULL);
+ }
+ EXPORT_SYMBOL(tty_register_device);
+ 
+ static void tty_device_create_release(struct device *dev)
+ {
+ 	dev_dbg(dev, "releasing...\n");
+ 	kfree(dev);
+ }
+ 
+ /**
+  *	tty_register_device_attr - register a tty device
+  *	@driver: the tty driver that describes the tty device
+  *	@index: the index in the tty driver for this tty device
+  *	@device: a struct device that is associated with this tty device.
+  *		This field is optional, if there is no known struct device
+  *		for this tty device it can be set to NULL safely.
+  *	@drvdata: Driver data to be set to device.
+  *	@attr_grp: Attribute group to be set on device.
+  *
+  *	Returns a pointer to the struct device for this tty device
+  *	(or ERR_PTR(-EFOO) on error).
+  *
+  *	This call is required to be made to register an individual tty device
+  *	if the tty driver's flags have the TTY_DRIVER_DYNAMIC_DEV bit set.  If
+  *	that bit is not set, this function should not be called by a tty
+  *	driver.
+  *
+  *	Locking: ??
+  */
+ struct device *tty_register_device_attr(struct tty_driver *driver,
+ 				   unsigned index, struct device *device,
+ 				   void *drvdata,
+ 				   const struct attribute_group **attr_grp)
+ {
+ 	char name[64];
+ 	dev_t devt = MKDEV(driver->major, driver->minor_start) + index;
+ 	struct ktermios *tp;
+ 	struct device *dev;
+ 	int retval;
+ 
+ 	if (index >= driver->num) {
+ 		pr_err("%s: Attempt to register invalid tty line number (%u)\n",
+ 		       driver->name, index);
+ 		return ERR_PTR(-EINVAL);
+ 	}
+ 
+ 	if (driver->type == TTY_DRIVER_TYPE_PTY)
+ 		pty_line_name(driver, index, name);
+ 	else
+ 		tty_line_name(driver, index, name);
+ 
+ 	dev = kzalloc(sizeof(*dev), GFP_KERNEL);
+ 	if (!dev)
+ 		return ERR_PTR(-ENOMEM);
+ 
+ 	dev->devt = devt;
+ 	dev->class = tty_class;
+ 	dev->parent = device;
+ 	dev->release = tty_device_create_release;
+ 	dev_set_name(dev, "%s", name);
+ 	dev->groups = attr_grp;
+ 	dev_set_drvdata(dev, drvdata);
+ 	/* uevents stay suppressed until the cdev step below has succeeded */
+ 	dev_set_uevent_suppress(dev, 1);
+ 
+ 	retval = device_register(dev);
+ 	if (retval)
+ 		goto err_put;
+ 
+ 	if (!(driver->flags & TTY_DRIVER_DYNAMIC_ALLOC)) {
+ 		/*
+ 		 * Free any saved termios data so that the termios state is
+ 		 * reset when reusing a minor number.
+ 		 */
+ 		tp = driver->termios[index];
+ 		if (tp) {
+ 			driver->termios[index] = NULL;
+ 			kfree(tp);
+ 		}
+ 
+ 		retval = tty_cdev_add(driver, devt, index, 1);
+ 		if (retval)
+ 			goto err_del;
+ 	}
+ 
+ 	dev_set_uevent_suppress(dev, 0);
+ 	kobject_uevent(&dev->kobj, KOBJ_ADD);
+ 
+ 	return dev;
+ 
+ err_del:
+ 	device_del(dev);
+ err_put:
+ 	put_device(dev);
+ 
+ 	return ERR_PTR(retval);
+ }
+ EXPORT_SYMBOL_GPL(tty_register_device_attr);
+ 
+ /**
+  * 	tty_unregister_device - unregister a tty device
+  * 	@driver: the tty driver that describes the tty device
+  * 	@index: the index in the tty driver for this tty device
+  *
+  * 	If a tty device is registered with a call to tty_register_device() then
+  *	this function must be called when the tty device is gone.
+  *
+  *	Locking: ??
+  */
+ 
+ void tty_unregister_device(struct tty_driver *driver, unsigned index)
+ {
+ 	device_destroy(tty_class,
+ 		MKDEV(driver->major, driver->minor_start) + index);
+ 	if (!(driver->flags & TTY_DRIVER_DYNAMIC_ALLOC)) {
+ 		cdev_del(driver->cdevs[index]);
+ 		driver->cdevs[index] = NULL;
+ 	}
+ }
+ EXPORT_SYMBOL(tty_unregister_device);
+ 
+ /**
+  * __tty_alloc_driver -- allocate tty driver
+  * @lines: count of lines this driver can handle at most
+  * @owner: module which is responsible for this driver
+  * @flags: some of TTY_DRIVER_* flags, will be set in driver->flags
+  *
+  * This should not be called directly, some of the provided macros should be
+  * used instead. Use IS_ERR and friends on @retval.
+  */
+ struct tty_driver *__tty_alloc_driver(unsigned int lines, struct module *owner,
+ 		unsigned long flags)
+ {
+ 	struct tty_driver *driver;
+ 	unsigned int cdevs = 1;
+ 	int err;
+ 
+ 	if (!lines || (flags & TTY_DRIVER_UNNUMBERED_NODE && lines > 1))
+ 		return ERR_PTR(-EINVAL);
+ 
+ 	driver = kzalloc(sizeof(struct tty_driver), GFP_KERNEL);
+ 	if (!driver)
+ 		return ERR_PTR(-ENOMEM);
+ 
+ 	kref_init(&driver->kref);
+ 	driver->magic = TTY_DRIVER_MAGIC;
+ 	driver->num = lines;
+ 	driver->owner = owner;
+ 	driver->flags = flags;
+ 
+ 	if (!(flags & TTY_DRIVER_DEVPTS_MEM)) {
+ 		driver->ttys = kcalloc(lines, sizeof(*driver->ttys),
+ 				GFP_KERNEL);
+ 		driver->termios = kcalloc(lines, sizeof(*driver->termios),
+ 				GFP_KERNEL);
+ 		if (!driver->ttys || !driver->termios) {
+ 			err = -ENOMEM;
+ 			goto err_free_all;
+ 		}
+ 	}
+ 
+ 	if (!(flags & TTY_DRIVER_DYNAMIC_ALLOC)) {
+ 		driver->ports = kcalloc(lines, sizeof(*driver->ports),
+ 				GFP_KERNEL);
+ 		if (!driver->ports) {
+ 			err = -ENOMEM;
+ 			goto err_free_all;
+ 		}
+ 		cdevs = lines;
+ 	}
+ 
+ 	driver->cdevs = kcalloc(cdevs, sizeof(*driver->cdevs), GFP_KERNEL);
+ 	if (!driver->cdevs) {
+ 		err = -ENOMEM;
+ 		goto err_free_all;
+ 	}
+ 
+ 	return driver;
+ err_free_all:
+ 	kfree(driver->ports);
+ 	kfree(driver->ttys);
+ 	kfree(driver->termios);
+ 	kfree(driver->cdevs);
+ 	kfree(driver);
+ 	return ERR_PTR(err);
+ }
+ EXPORT_SYMBOL(__tty_alloc_driver);
+ 
+ static void destruct_tty_driver(struct kref *kref)
+ {
+ 	struct tty_driver *driver = container_of(kref, struct tty_driver, kref);
+ 	int i;
+ 	struct ktermios *tp;
+ 
+ 	if (driver->flags & TTY_DRIVER_INSTALLED) {
+ 		for (i = 0; i < driver->num; i++) {
+ 			tp = driver->termios[i];
+ 			if (tp) {
+ 				driver->termios[i] = NULL;
+ 				kfree(tp);
+ 			}
+ 			if (!(driver->flags & TTY_DRIVER_DYNAMIC_DEV))
+ 				tty_unregister_device(driver, i);
+ 		}
+ 		proc_tty_unregister_driver(driver);
+ 		if (driver->flags & TTY_DRIVER_DYNAMIC_ALLOC)
+ 			cdev_del(driver->cdevs[0]);
+ 	}
+ 	kfree(driver->cdevs);
+ 	kfree(driver->ports);
+ 	kfree(driver->termios);
+ 	kfree(driver->ttys);
+ 	kfree(driver);
+ }
+ 
+ void tty_driver_kref_put(struct tty_driver *driver)
+ {
+ 	kref_put(&driver->kref, destruct_tty_driver);
+ }
+ EXPORT_SYMBOL(tty_driver_kref_put);
+ 
+ /* Install @op as the operations table of @driver. */
+ void tty_set_operations(struct tty_driver *driver, const struct tty_operations *op)
+ {
+ 	driver->ops = op;
+ }
+ EXPORT_SYMBOL(tty_set_operations);
+ 
+ void put_tty_driver(struct tty_driver *d)
+ {
+ 	tty_driver_kref_put(d);
+ }
+ EXPORT_SYMBOL(put_tty_driver);
+ 
+ /*
+  * Called by a tty driver to register itself.
+  */
+ int tty_register_driver(struct tty_driver *driver)
+ {
+ 	int error;
+ 	int i;
+ 	dev_t dev;
+ 	struct device *d;
+ 
+ 	if (!driver->major) {
+ 		error = alloc_chrdev_region(&dev, driver->minor_start,
+ 						driver->num, driver->name);
+ 		if (!error) {
+ 			driver->major = MAJOR(dev);
+ 			driver->minor_start = MINOR(dev);
+ 		}
+ 	} else {
+ 		dev = MKDEV(driver->major, driver->minor_start);
+ 		error = register_chrdev_region(dev, driver->num, driver->name);
+ 	}
+ 	if (error < 0)
+ 		goto err;
+ 
+ 	if (driver->flags & TTY_DRIVER_DYNAMIC_ALLOC) {
+ 		error = tty_cdev_add(driver, dev, 0, driver->num);
+ 		if (error)
+ 			goto err_unreg_char;
+ 	}
+ 
+ 	mutex_lock(&tty_mutex);
+ 	list_add(&driver->tty_drivers, &tty_drivers);
+ 	mutex_unlock(&tty_mutex);
+ 
+ 	if (!(driver->flags & TTY_DRIVER_DYNAMIC_DEV)) {
+ 		for (i = 0; i < driver->num; i++) {
+ 			d = tty_register_device(driver, i, NULL);
+ 			if (IS_ERR(d)) {
+ 				error = PTR_ERR(d);
+ 				goto err_unreg_devs;
+ 			}
+ 		}
+ 	}
+ 	proc_tty_register_driver(driver);
+ 	driver->flags |= TTY_DRIVER_INSTALLED;
+ 	return 0;
+ 
+ err_unreg_devs:
+ 	for (i--; i >= 0; i--)
+ 		tty_unregister_device(driver, i);
+ 
+ 	mutex_lock(&tty_mutex);
+ 	list_del(&driver->tty_drivers);
+ 	mutex_unlock(&tty_mutex);
+ 
+ err_unreg_char:
+ 	unregister_chrdev_region(dev, driver->num);
+ err:
+ 	return error;
+ }
+ EXPORT_SYMBOL(tty_register_driver);
+ 
+ /*
+  * Called by a tty driver to unregister itself.
+  */
+ int tty_unregister_driver(struct tty_driver *driver)
+ {
+ #if 0
+ 	/* FIXME */
+ 	if (driver->refcount)
+ 		return -EBUSY;
+ #endif
+ 	unregister_chrdev_region(MKDEV(driver->major, driver->minor_start),
+ 				driver->num);
+ 	mutex_lock(&tty_mutex);
+ 	list_del(&driver->tty_drivers);
+ 	mutex_unlock(&tty_mutex);
+ 	return 0;
+ }
+ 
+ EXPORT_SYMBOL(tty_unregister_driver);
+ 
+ dev_t tty_devnum(struct tty_struct *tty)
+ {
+ 	return MKDEV(tty->driver->major, tty->driver->minor_start) + tty->index;
+ }
+ EXPORT_SYMBOL(tty_devnum);
+ 
+ void tty_default_fops(struct file_operations *fops)
+ {
+ 	*fops = tty_fops;
+ }
+ 
+ /* Class ->devnode hook: TTYAUX_MAJOR minors 0 and 2 get mode 0666. */
+ static char *tty_devnode(struct device *dev, umode_t *mode)
+ {
+ 	if (mode && (dev->devt == MKDEV(TTYAUX_MAJOR, 0) ||
+ 		     dev->devt == MKDEV(TTYAUX_MAJOR, 2)))
+ 		*mode = 0666;
+ 
+ 	return NULL;
+ }
+ 
+ static int __init tty_class_init(void)
+ {
+ 	tty_class = class_create(THIS_MODULE, "tty");
+ 	if (IS_ERR(tty_class))
+ 		return PTR_ERR(tty_class);
+ 	tty_class->devnode = tty_devnode;
+ 	return 0;
+ }
+ 
+ postcore_initcall(tty_class_init);
+ 
+ /* 3/2004 jmc: why do these devices exist? */
+ static struct cdev tty_cdev, console_cdev;
+ 
+ static ssize_t show_cons_active(struct device *dev,
+ 				struct device_attribute *attr, char *buf)
+ {
+ 	struct console *cs[16];
+ 	int i = 0;
+ 	struct console *c;
+ 	ssize_t count = 0;
+ 
+ 	console_lock();
+ 	for_each_console(c) {
+ 		if (!c->device)
+ 			continue;
+ 		if (!c->write)
+ 			continue;
+ 		if ((c->flags & CON_ENABLED) == 0)
+ 			continue;
+ 		cs[i++] = c;
+ 		if (i >= ARRAY_SIZE(cs))
+ 			break;
+ 	}
+ 	while (i--) {
+ 		int index = cs[i]->index;
+ 		struct tty_driver *drv = cs[i]->device(cs[i], &index);
+ 
+ 		/* don't resolve tty0 as some programs depend on it */
+ 		if (drv && (cs[i]->index > 0 || drv->major != TTY_MAJOR))
+ 			count += tty_line_name(drv, index, buf + count);
+ 		else
+ 			count += sprintf(buf + count, "%s%d",
+ 					 cs[i]->name, cs[i]->index);
+ 
+ 		count += sprintf(buf + count, "%c", i ? ' ':'\n');
+ 	}
+ 	console_unlock();
+ 
+ 	return count;
+ }
+ static DEVICE_ATTR(active, S_IRUGO, show_cons_active, NULL);
+ 
+ static struct attribute *cons_dev_attrs[] = {
+ 	&dev_attr_active.attr,
+ 	NULL
+ };
+ 
+ ATTRIBUTE_GROUPS(cons_dev);
+ 
+ static struct device *consdev;
+ 
+ void console_sysfs_notify(void)
+ {
+ 	if (consdev)
+ 		sysfs_notify(&consdev->kobj, NULL, "active");
+ }
+ 
+ /*
+  * Ok, now we can initialize the rest of the tty devices and can count
+  * on memory allocations, interrupts etc..
+  */
+ int __init tty_init(void)
+ {
+ 	tty_sysctl_init();
+ 	cdev_init(&tty_cdev, &tty_fops);
+ 	if (cdev_add(&tty_cdev, MKDEV(TTYAUX_MAJOR, 0), 1) ||
+ 	    register_chrdev_region(MKDEV(TTYAUX_MAJOR, 0), 1, "/dev/tty") < 0)
+ 		panic("Couldn't register /dev/tty driver\n");
+ 	device_create(tty_class, NULL, MKDEV(TTYAUX_MAJOR, 0), NULL, "tty");
+ 
+ 	cdev_init(&console_cdev, &console_fops);
+ 	if (cdev_add(&console_cdev, MKDEV(TTYAUX_MAJOR, 1), 1) ||
+ 	    register_chrdev_region(MKDEV(TTYAUX_MAJOR, 1), 1, "/dev/console") < 0)
+ 		panic("Couldn't register /dev/console driver\n");
+ 	consdev = device_create_with_groups(tty_class, NULL,
+ 					    MKDEV(TTYAUX_MAJOR, 1), NULL,
+ 					    cons_dev_groups, "console");
+ 	if (IS_ERR(consdev))
+ 		consdev = NULL;
+ 
+ #ifdef CONFIG_VT
+ 	vty_init(&console_fops);
+ #endif
+ 	return 0;
+ }
diff --color -rcNP Master/drivers/tty/vt/keyboard.c OG/drivers/tty/vt/keyboard.c
*** Master/drivers/tty/vt/keyboard.c	2021-04-20 14:17:30.000000000 -0400
--- OG/drivers/tty/vt/keyboard.c	2021-04-20 15:11:34.505000000 -0400
***************
*** 637,642 ****
--- 637,650 ----
  	     kbd->kbdmode == VC_OFF) &&
  	     value != KVAL(K_SAK))
  		return;		/* SAK is allowed even in raw mode */
+ #if defined(CONFIG_MINISEC_PROC) || defined(CONFIG_MINISEC_PROC_MEMMAP)
+ 	{
+ 		void *func = fn_handler[value];
+ 		if (func == fn_show_state || func == fn_show_ptregs ||
+ 		    func == fn_show_mem)
+ 			return;
+ 	}
+ #endif
  	fn_handler[value](vc);
  }
  
***************
*** 1780,1786 ****
  					  ct * sizeof(struct kbdiacruc));
  			if (IS_ERR(buf))
  				return PTR_ERR(buf);
! 		} 
  		spin_lock_irqsave(&kbd_event_lock, flags);
  		if (ct)
  			memcpy(accent_table, buf,
--- 1788,1794 ----
  					  ct * sizeof(struct kbdiacruc));
  			if (IS_ERR(buf))
  				return PTR_ERR(buf);
! 		}
  		spin_lock_irqsave(&kbd_event_lock, flags);
  		if (ct)
  			memcpy(accent_table, buf,
***************
*** 1918,1923 ****
--- 1926,1933 ----
  		spin_unlock_irqrestore(&kbd_event_lock, flags);
  		return put_user(val, &user_kbe->kb_value);
  	case KDSKBENT:
+ 		if (!capable(CAP_SYS_TTY_CONFIG))
+ 			perm = 0;
  		if (!perm)
  			return -EPERM;
  		if (!i && v == K_NOSUCHMAP) {
diff --color -rcNP Master/drivers/tty/vt/keyboard.c.orig OG/drivers/tty/vt/keyboard.c.orig
*** Master/drivers/tty/vt/keyboard.c.orig	1969-12-31 19:00:00.000000000 -0500
--- OG/drivers/tty/vt/keyboard.c.orig	2021-04-20 15:10:45.377000000 -0400
***************
*** 0 ****
--- 1,2314 ----
+ // SPDX-License-Identifier: GPL-2.0
+ /*
+  * Written for linux by Johan Myreen as a translation from
+  * the assembly version by Linus (with diacriticals added)
+  *
+  * Some additional features added by Christoph Niemann (ChN), March 1993
+  *
+  * Loadable keymaps by Risto Kankkunen, May 1993
+  *
+  * Diacriticals redone & other small changes, aeb@cwi.nl, June 1993
+  * Added decr/incr_console, dynamic keymaps, Unicode support,
+  * dynamic function/string keys, led setting,  Sept 1994
+  * `Sticky' modifier keys, 951006.
+  *
+  * 11-11-96: SAK should now work in the raw mode (Martin Mares)
+  *
+  * Modified to provide 'generic' keyboard support by Hamish Macdonald
+  * Merge with the m68k keyboard driver and split-off of the PC low-level
+  * parts by Geert Uytterhoeven, May 1997
+  *
+  * 27-05-97: Added support for the Magic SysRq Key (Martin Mares)
+  * 30-07-98: Dead keys redone, aeb@cwi.nl.
+  * 21-08-02: Converted to input API, major cleanup. (Vojtech Pavlik)
+  */
+ 
+ #define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
+ 
+ #include <linux/consolemap.h>
+ #include <linux/module.h>
+ #include <linux/sched/signal.h>
+ #include <linux/sched/debug.h>
+ #include <linux/tty.h>
+ #include <linux/tty_flip.h>
+ #include <linux/mm.h>
+ #include <linux/string.h>
+ #include <linux/init.h>
+ #include <linux/slab.h>
+ #include <linux/leds.h>
+ 
+ #include <linux/kbd_kern.h>
+ #include <linux/kbd_diacr.h>
+ #include <linux/vt_kern.h>
+ #include <linux/input.h>
+ #include <linux/reboot.h>
+ #include <linux/notifier.h>
+ #include <linux/jiffies.h>
+ #include <linux/uaccess.h>
+ 
+ #include <asm/irq_regs.h>
+ 
+ extern void ctrl_alt_del(void);
+ 
+ /*
+  * Exported functions/variables
+  */
+ 
+ #define KBD_DEFMODE ((1 << VC_REPEAT) | (1 << VC_META))
+ 
+ #if defined(CONFIG_X86) || defined(CONFIG_PARISC)
+ #include <asm/kbdleds.h>
+ #else
+ static inline int kbd_defleds(void)
+ {
+ 	return 0;
+ }
+ #endif
+ 
+ #define KBD_DEFLOCK 0
+ 
+ /*
+  * Handler Tables.
+  */
+ 
+ #define K_HANDLERS\
+ 	k_self,		k_fn,		k_spec,		k_pad,\
+ 	k_dead,		k_cons,		k_cur,		k_shift,\
+ 	k_meta,		k_ascii,	k_lock,		k_lowercase,\
+ 	k_slock,	k_dead2,	k_brl,		k_ignore
+ 
+ typedef void (k_handler_fn)(struct vc_data *vc, unsigned char value,
+ 			    char up_flag);
+ static k_handler_fn K_HANDLERS;
+ static k_handler_fn *k_handler[16] = { K_HANDLERS };
+ 
+ #define FN_HANDLERS\
+ 	fn_null,	fn_enter,	fn_show_ptregs,	fn_show_mem,\
+ 	fn_show_state,	fn_send_intr,	fn_lastcons,	fn_caps_toggle,\
+ 	fn_num,		fn_hold,	fn_scroll_forw,	fn_scroll_back,\
+ 	fn_boot_it,	fn_caps_on,	fn_compose,	fn_SAK,\
+ 	fn_dec_console, fn_inc_console, fn_spawn_con,	fn_bare_num
+ 
+ typedef void (fn_handler_fn)(struct vc_data *vc);
+ static fn_handler_fn FN_HANDLERS;
+ static fn_handler_fn *fn_handler[] = { FN_HANDLERS };
+ 
+ /*
+  * Variables exported for vt_ioctl.c
+  */
+ 
+ struct vt_spawn_console vt_spawn_con = {
+ 	.lock = __SPIN_LOCK_UNLOCKED(vt_spawn_con.lock),
+ 	.pid  = NULL,
+ 	.sig  = 0,
+ };
+ 
+ 
+ /*
+  * Internal Data.
+  */
+ 
+ static struct kbd_struct kbd_table[MAX_NR_CONSOLES];
+ static struct kbd_struct *kbd = kbd_table;
+ 
+ /* maximum values each key_handler can handle */
+ static const int max_vals[] = {
+ 	255, ARRAY_SIZE(func_table) - 1, ARRAY_SIZE(fn_handler) - 1, NR_PAD - 1,
+ 	NR_DEAD - 1, 255, 3, NR_SHIFT - 1, 255, NR_ASCII - 1, NR_LOCK - 1,
+ 	255, NR_LOCK - 1, 255, NR_BRL - 1
+ };
+ 
+ static const int NR_TYPES = ARRAY_SIZE(max_vals);
+ 
+ static struct input_handler kbd_handler;
+ static DEFINE_SPINLOCK(kbd_event_lock);
+ static DEFINE_SPINLOCK(led_lock);
+ static DEFINE_SPINLOCK(func_buf_lock); /* guard 'func_buf'  and friends */
+ static unsigned long key_down[BITS_TO_LONGS(KEY_CNT)];	/* keyboard key bitmap */
+ static unsigned char shift_down[NR_SHIFT];		/* shift state counters.. */
+ static bool dead_key_next;
+ 
+ /* Handles a number being assembled on the number pad */
+ static bool npadch_active;
+ static unsigned int npadch_value;
+ 
+ static unsigned int diacr;
+ static char rep;					/* flag telling character repeat */
+ 
+ static int shift_state = 0;
+ 
+ static unsigned int ledstate = -1U;			/* undefined */
+ static unsigned char ledioctl;
+ 
+ /*
+  * Notifier list for console keyboard events
+  */
+ static ATOMIC_NOTIFIER_HEAD(keyboard_notifier_list);
+ 
+ int register_keyboard_notifier(struct notifier_block *nb)
+ {
+ 	return atomic_notifier_chain_register(&keyboard_notifier_list, nb);
+ }
+ EXPORT_SYMBOL_GPL(register_keyboard_notifier);
+ 
+ int unregister_keyboard_notifier(struct notifier_block *nb)
+ {
+ 	return atomic_notifier_chain_unregister(&keyboard_notifier_list, nb);
+ }
+ EXPORT_SYMBOL_GPL(unregister_keyboard_notifier);
+ 
+ /*
+  * Translation of scancodes to keycodes. We set them on only the first
+  * keyboard in the list that accepts the scancode and keycode.
+  * Explanation for not choosing the first attached keyboard anymore:
+  *  USB keyboards for example have two event devices: one for all "normal"
+  *  keys and one for extra function keys (like "volume up", "make coffee",
+  *  etc.). So this means that scancodes for the extra function keys won't
+  *  be valid for the first event device, but will be for the second.
+  */
+ 
+ struct getset_keycode_data {
+ 	struct input_keymap_entry ke;
+ 	int error;
+ };
+ 
+ static int getkeycode_helper(struct input_handle *handle, void *data)
+ {
+ 	struct getset_keycode_data *d = data;
+ 
+ 	d->error = input_get_keycode(handle->dev, &d->ke);
+ 
+ 	return d->error == 0; /* stop as soon as we successfully get one */
+ }
+ 
+ static int getkeycode(unsigned int scancode)
+ {
+ 	struct getset_keycode_data d = {
+ 		.ke	= {
+ 			.flags		= 0,
+ 			.len		= sizeof(scancode),
+ 			.keycode	= 0,
+ 		},
+ 		.error	= -ENODEV,
+ 	};
+ 
+ 	memcpy(d.ke.scancode, &scancode, sizeof(scancode));
+ 
+ 	input_handler_for_each_handle(&kbd_handler, &d, getkeycode_helper);
+ 
+ 	return d.error ?: d.ke.keycode;
+ }
+ 
+ static int setkeycode_helper(struct input_handle *handle, void *data)
+ {
+ 	struct getset_keycode_data *d = data;
+ 
+ 	d->error = input_set_keycode(handle->dev, &d->ke);
+ 
+ 	return d->error == 0; /* stop as soon as we successfully set one */
+ }
+ 
+ static int setkeycode(unsigned int scancode, unsigned int keycode)
+ {
+ 	struct getset_keycode_data d = {
+ 		.ke	= {
+ 			.flags		= 0,
+ 			.len		= sizeof(scancode),
+ 			.keycode	= keycode,
+ 		},
+ 		.error	= -ENODEV,
+ 	};
+ 
+ 	memcpy(d.ke.scancode, &scancode, sizeof(scancode));
+ 
+ 	input_handler_for_each_handle(&kbd_handler, &d, setkeycode_helper);
+ 
+ 	return d.error;
+ }
+ 
+ /*
+  * Making beeps and bells. Note that we prefer beeps to bells, but when
+  * shutting the sound off we do both.
+  */
+ 
+ static int kd_sound_helper(struct input_handle *handle, void *data)
+ {
+ 	struct input_dev *snd_dev = handle->dev;
+ 	unsigned int *freq = data;
+ 
+ 	if (!test_bit(EV_SND, snd_dev->evbit))
+ 		return 0;
+ 	if (test_bit(SND_TONE, snd_dev->sndbit)) {
+ 		input_inject_event(handle, EV_SND, SND_TONE, *freq);
+ 		/* a non-zero tone was requested: skip the bell on this device */
+ 		if (*freq)
+ 			return 0;
+ 	}
+ 	if (test_bit(SND_BELL, snd_dev->sndbit))
+ 		input_inject_event(handle, EV_SND, SND_BELL, !!*freq);
+ 	return 0;
+ }
+ 
+ static void kd_nosound(struct timer_list *unused)
+ {
+ 	static unsigned int zero;
+ 
+ 	input_handler_for_each_handle(&kbd_handler, &zero, kd_sound_helper);
+ }
+ 
+ static DEFINE_TIMER(kd_mksound_timer, kd_nosound);
+ 
+ void kd_mksound(unsigned int hz, unsigned int ticks)
+ {
+ 	del_timer_sync(&kd_mksound_timer);
+ 
+ 	input_handler_for_each_handle(&kbd_handler, &hz, kd_sound_helper);
+ 
+ 	if (hz && ticks)
+ 		mod_timer(&kd_mksound_timer, jiffies + ticks);
+ }
+ EXPORT_SYMBOL(kd_mksound);
+ 
+ /*
+  * Setting the keyboard rate.
+  */
+ 
+ static int kbd_rate_helper(struct input_handle *handle, void *data)
+ {
+ 	struct input_dev *dev = handle->dev;
+ 	struct kbd_repeat *rpt = data;
+ 
+ 	if (test_bit(EV_REP, dev->evbit)) {
+ 
+ 		if (rpt[0].delay > 0)
+ 			input_inject_event(handle,
+ 					   EV_REP, REP_DELAY, rpt[0].delay);
+ 		if (rpt[0].period > 0)
+ 			input_inject_event(handle,
+ 					   EV_REP, REP_PERIOD, rpt[0].period);
+ 
+ 		rpt[1].delay = dev->rep[REP_DELAY];
+ 		rpt[1].period = dev->rep[REP_PERIOD];
+ 	}
+ 
+ 	return 0;
+ }
+ 
+ int kbd_rate(struct kbd_repeat *rpt)
+ {
+ 	struct kbd_repeat data[2] = { *rpt };
+ 
+ 	input_handler_for_each_handle(&kbd_handler, data, kbd_rate_helper);
+ 	*rpt = data[1];	/* Copy currently used settings */
+ 
+ 	return 0;
+ }
+ 
+ /*
+  * Helper Functions.
+  */
+ static void put_queue(struct vc_data *vc, int ch)
+ {
+ 	tty_insert_flip_char(&vc->port, ch, 0);
+ 	tty_schedule_flip(&vc->port);
+ }
+ 
+ static void puts_queue(struct vc_data *vc, char *cp)
+ {
+ 	while (*cp) {
+ 		tty_insert_flip_char(&vc->port, *cp, 0);
+ 		cp++;
+ 	}
+ 	tty_schedule_flip(&vc->port);
+ }
+ 
+ static void applkey(struct vc_data *vc, int key, char mode)
+ {
+ 	static char buf[] = { 0x1b, 'O', 0x00, 0x00 };
+ 
+ 	buf[1] = (mode ? 'O' : '[');
+ 	buf[2] = key;
+ 	puts_queue(vc, buf);
+ }
+ 
+ /*
+  * Many other routines do put_queue, but I think either
+  * they produce ASCII, or they produce some user-assigned
+  * string, and in both cases we might assume that it is
+  * in utf-8 already.
+  */
+ static void to_utf8(struct vc_data *vc, uint c)
+ {
+ 	if (c < 0x80)
+ 		/*  0******* */
+ 		put_queue(vc, c);
+ 	else if (c < 0x800) {
+ 		/* 110***** 10****** */
+ 		put_queue(vc, 0xc0 | (c >> 6));
+ 		put_queue(vc, 0x80 | (c & 0x3f));
+ 	} else if (c < 0x10000) {
+ 		if (c >= 0xD800 && c < 0xE000)
+ 			return;
+ 		if (c == 0xFFFF)
+ 			return;
+ 		/* 1110**** 10****** 10****** */
+ 		put_queue(vc, 0xe0 | (c >> 12));
+ 		put_queue(vc, 0x80 | ((c >> 6) & 0x3f));
+ 		put_queue(vc, 0x80 | (c & 0x3f));
+ 	} else if (c < 0x110000) {
+ 		/* 11110*** 10****** 10****** 10****** */
+ 		put_queue(vc, 0xf0 | (c >> 18));
+ 		put_queue(vc, 0x80 | ((c >> 12) & 0x3f));
+ 		put_queue(vc, 0x80 | ((c >> 6) & 0x3f));
+ 		put_queue(vc, 0x80 | (c & 0x3f));
+ 	}
+ }
+ 
+ /*
+  * Called after returning from RAW mode or when changing consoles - recompute
+  * shift_down[] and shift_state from key_down[]. May be called when keymap is
+  * undefined, so that shiftkey release is seen. The caller must hold the
+  * kbd_event_lock.
+  */
+ 
+ static void do_compute_shiftstate(void)
+ {
+ 	unsigned int k, sym, val;
+ 
+ 	shift_state = 0;
+ 	memset(shift_down, 0, sizeof(shift_down));
+ 
+ 	for_each_set_bit(k, key_down, min(NR_KEYS, KEY_CNT)) {
+ 		sym = U(key_maps[0][k]);
+ 		if (KTYP(sym) != KT_SHIFT && KTYP(sym) != KT_SLOCK)
+ 			continue;
+ 
+ 		val = KVAL(sym);
+ 		if (val == KVAL(K_CAPSSHIFT))
+ 			val = KVAL(K_SHIFT);
+ 
+ 		shift_down[val]++;
+ 		shift_state |= BIT(val);
+ 	}
+ }
+ 
+ /* We still have to export this method to vt.c */
+ void compute_shiftstate(void)
+ {
+ 	unsigned long flags;
+ 	spin_lock_irqsave(&kbd_event_lock, flags);
+ 	do_compute_shiftstate();
+ 	spin_unlock_irqrestore(&kbd_event_lock, flags);
+ }
+ 
+ /*
+  * We have a combining character DIACR here, followed by the character CH.
+  * If the combination occurs in the table, return the corresponding value.
+  * Otherwise, if CH is a space or equals DIACR, return DIACR.
+  * Otherwise, conclude that DIACR was not combining after all,
+  * queue it and return CH.
+  */
+ static unsigned int handle_diacr(struct vc_data *vc, unsigned int ch)
+ {
+ 	unsigned int d = diacr;
+ 	unsigned int i;
+ 
+ 	diacr = 0;
+ 
+ 	if ((d & ~0xff) == BRL_UC_ROW) {
+ 		if ((ch & ~0xff) == BRL_UC_ROW)
+ 			return d | ch;
+ 	} else {
+ 		for (i = 0; i < accent_table_size; i++)
+ 			if (accent_table[i].diacr == d && accent_table[i].base == ch)
+ 				return accent_table[i].result;
+ 	}
+ 
+ 	if (ch == ' ' || ch == (BRL_UC_ROW|0) || ch == d)
+ 		return d;
+ 
+ 	if (kbd->kbdmode == VC_UNICODE)
+ 		to_utf8(vc, d);
+ 	else {
+ 		int c = conv_uni_to_8bit(d);
+ 		if (c != -1)
+ 			put_queue(vc, c);
+ 	}
+ 
+ 	return ch;
+ }
+ 
+ /*
+  * Special function handlers
+  */
+ static void fn_enter(struct vc_data *vc)
+ {
+ 	if (diacr) {
+ 		if (kbd->kbdmode == VC_UNICODE)
+ 			to_utf8(vc, diacr);
+ 		else {
+ 			int c = conv_uni_to_8bit(diacr);
+ 			if (c != -1)
+ 				put_queue(vc, c);
+ 		}
+ 		diacr = 0;
+ 	}
+ 
+ 	put_queue(vc, 13);
+ 	if (vc_kbd_mode(kbd, VC_CRLF))
+ 		put_queue(vc, 10);
+ }
+ 
+ static void fn_caps_toggle(struct vc_data *vc)
+ {
+ 	if (rep)
+ 		return;
+ 
+ 	chg_vc_kbd_led(kbd, VC_CAPSLOCK);
+ }
+ 
+ static void fn_caps_on(struct vc_data *vc)
+ {
+ 	if (rep)
+ 		return;
+ 
+ 	set_vc_kbd_led(kbd, VC_CAPSLOCK);
+ }
+ 
+ static void fn_show_ptregs(struct vc_data *vc)
+ {
+ 	struct pt_regs *regs = get_irq_regs();
+ 
+ 	if (regs)
+ 		show_regs(regs);
+ }
+ 
+ static void fn_hold(struct vc_data *vc)
+ {
+ 	struct tty_struct *tty = vc->port.tty;
+ 
+ 	if (rep || !tty)
+ 		return;
+ 
+ 	/*
+ 	 * Note: SCROLLOCK will be set (cleared) by stop_tty (start_tty);
+ 	 * these routines are also activated by ^S/^Q.
+ 	 * (And SCROLLOCK can also be set by the ioctl KDSKBLED.)
+ 	 */
+ 	if (tty->stopped)
+ 		start_tty(tty);
+ 	else
+ 		stop_tty(tty);
+ }
+ 
+ static void fn_num(struct vc_data *vc)
+ {
+ 	if (vc_kbd_mode(kbd, VC_APPLIC))
+ 		applkey(vc, 'P', 1);
+ 	else
+ 		fn_bare_num(vc);
+ }
+ 
+ /*
+  * Bind this to Shift-NumLock if you work in application keypad mode
+  * but want to be able to change the NumLock flag.
+  * Bind this to NumLock if you prefer that the NumLock key always
+  * changes the NumLock flag.
+  */
+ static void fn_bare_num(struct vc_data *vc)
+ {
+ 	if (!rep)
+ 		chg_vc_kbd_led(kbd, VC_NUMLOCK);
+ }
+ 
+ static void fn_lastcons(struct vc_data *vc)
+ {
+ 	/* switch to the last used console, ChN */
+ 	set_console(last_console);
+ }
+ 
+ static void fn_dec_console(struct vc_data *vc)
+ {
+ 	int i, cur = fg_console;
+ 
+ 	/* Currently switching?  Queue this next switch relative to that. */
+ 	if (want_console != -1)
+ 		cur = want_console;
+ 
+ 	for (i = cur - 1; i != cur; i--) {
+ 		if (i == -1)
+ 			i = MAX_NR_CONSOLES - 1;
+ 		if (vc_cons_allocated(i))
+ 			break;
+ 	}
+ 	set_console(i);
+ }
+ 
+ static void fn_inc_console(struct vc_data *vc)
+ {
+ 	int i, cur = fg_console;
+ 
+ 	/* Currently switching?  Queue this next switch relative to that. */
+ 	if (want_console != -1)
+ 		cur = want_console;
+ 
+ 	for (i = cur+1; i != cur; i++) {
+ 		if (i == MAX_NR_CONSOLES)
+ 			i = 0;
+ 		if (vc_cons_allocated(i))
+ 			break;
+ 	}
+ 	set_console(i);
+ }
+ 
+ static void fn_send_intr(struct vc_data *vc)
+ {
+ 	tty_insert_flip_char(&vc->port, 0, TTY_BREAK);
+ 	tty_schedule_flip(&vc->port);
+ }
+ 
+ static void fn_scroll_forw(struct vc_data *vc)
+ {
+ 	scrollfront(vc, 0);
+ }
+ 
+ static void fn_scroll_back(struct vc_data *vc)
+ {
+ 	scrollback(vc);
+ }
+ 
+ static void fn_show_mem(struct vc_data *vc)
+ {
+ 	show_mem(0, NULL);
+ }
+ 
+ static void fn_show_state(struct vc_data *vc)
+ {
+ 	show_state();
+ }
+ 
+ static void fn_boot_it(struct vc_data *vc)
+ {
+ 	ctrl_alt_del();
+ }
+ 
+ static void fn_compose(struct vc_data *vc)
+ {
+ 	dead_key_next = true;
+ }
+ 
+ static void fn_spawn_con(struct vc_data *vc)
+ {
+ 	spin_lock(&vt_spawn_con.lock);
+ 	if (vt_spawn_con.pid)
+ 		if (kill_pid(vt_spawn_con.pid, vt_spawn_con.sig, 1)) {
+ 			put_pid(vt_spawn_con.pid);
+ 			vt_spawn_con.pid = NULL;
+ 		}
+ 	spin_unlock(&vt_spawn_con.lock);
+ }
+ 
+ static void fn_SAK(struct vc_data *vc)
+ {
+ 	struct work_struct *SAK_work = &vc_cons[fg_console].SAK_work;
+ 	schedule_work(SAK_work);
+ }
+ 
+ static void fn_null(struct vc_data *vc)
+ {
+ 	do_compute_shiftstate();
+ }
+ 
+ /*
+  * Special key handlers
+  */
+ static void k_ignore(struct vc_data *vc, unsigned char value, char up_flag)
+ {
+ }
+ 
+ static void k_spec(struct vc_data *vc, unsigned char value, char up_flag)
+ {
+ 	if (up_flag)
+ 		return;
+ 	if (value >= ARRAY_SIZE(fn_handler))
+ 		return;
+ 	if ((kbd->kbdmode == VC_RAW ||
+ 	     kbd->kbdmode == VC_MEDIUMRAW ||
+ 	     kbd->kbdmode == VC_OFF) &&
+ 	     value != KVAL(K_SAK))
+ 		return;		/* SAK is allowed even in raw mode */
+ #if defined(CONFIG_MINISEC_PROC) || defined(CONFIG_MINISEC_PROC_MEMMAP)
+ 	{
+ 		void *func = fn_handler[value];
+ 		if (func == fn_show_state || func == fn_show_ptregs ||
+ 		    func == fn_show_mem)
+ 			return;
+ 	}
+ #endif
+ 	fn_handler[value](vc);
+ }
+ 
+ static void k_lowercase(struct vc_data *vc, unsigned char value, char up_flag)
+ {
+ 	pr_err("k_lowercase was called - impossible\n");
+ }
+ 
+ static void k_unicode(struct vc_data *vc, unsigned int value, char up_flag)
+ {
+ 	if (up_flag)
+ 		return;		/* no action, if this is a key release */
+ 
+ 	if (diacr)
+ 		value = handle_diacr(vc, value);
+ 
+ 	if (dead_key_next) {
+ 		dead_key_next = false;
+ 		diacr = value;
+ 		return;
+ 	}
+ 	if (kbd->kbdmode == VC_UNICODE)
+ 		to_utf8(vc, value);
+ 	else {
+ 		int c = conv_uni_to_8bit(value);
+ 		if (c != -1)
+ 			put_queue(vc, c);
+ 	}
+ }
+ 
+ /*
+  * Handle dead key. Note that we now may have several
+  * dead keys modifying the same character. Very useful
+  * for Vietnamese.
+  */
+ static void k_deadunicode(struct vc_data *vc, unsigned int value, char up_flag)
+ {
+ 	if (up_flag)
+ 		return;
+ 
+ 	diacr = (diacr ? handle_diacr(vc, value) : value);
+ }
+ 
+ static void k_self(struct vc_data *vc, unsigned char value, char up_flag)
+ {
+ 	k_unicode(vc, conv_8bit_to_uni(value), up_flag);
+ }
+ 
+ static void k_dead2(struct vc_data *vc, unsigned char value, char up_flag)
+ {
+ 	k_deadunicode(vc, value, up_flag);
+ }
+ 
+ /*
+  * Obsolete - for backwards compatibility only: maps value through ret_diacr[]
+  */
+ static void k_dead(struct vc_data *vc, unsigned char value, char up_flag)
+ {
+ 	static const unsigned char ret_diacr[NR_DEAD] = {
+ 		'`',	/* dead_grave */
+ 		'\'',	/* dead_acute */
+ 		'^',	/* dead_circumflex */
+ 		'~',	/* dead_tilde */
+ 		'"',	/* dead_diaeresis */
+ 		',',	/* dead_cedilla */
+ 		'_',	/* dead_macron */
+ 		'U',	/* dead_breve */
+ 		'.',	/* dead_abovedot */
+ 		'*',	/* dead_abovering */
+ 		'=',	/* dead_doubleacute */
+ 		'c',	/* dead_caron */
+ 		'k',	/* dead_ogonek */
+ 		'i',	/* dead_iota */
+ 		'#',	/* dead_voiced_sound */
+ 		'o',	/* dead_semivoiced_sound */
+ 		'!',	/* dead_belowdot */
+ 		'?',	/* dead_hook */
+ 		'+',	/* dead_horn */
+ 		'-',	/* dead_stroke */
+ 		')',	/* dead_abovecomma */
+ 		'(',	/* dead_abovereversedcomma */
+ 		':',	/* dead_doublegrave */
+ 		'n',	/* dead_invertedbreve */
+ 		';',	/* dead_belowcomma */
+ 		'$',	/* dead_currency */
+ 		'@',	/* dead_greek */
+ 	};
+ 
+ 	k_deadunicode(vc, ret_diacr[value], up_flag);
+ }
+ 
+ static void k_cons(struct vc_data *vc, unsigned char value, char up_flag)
+ {
+ 	if (up_flag)
+ 		return;
+ 
+ 	set_console(value);
+ }
+ 
+ static void k_fn(struct vc_data *vc, unsigned char value, char up_flag)
+ {
+ 	if (up_flag)
+ 		return;
+ 
+ 	if ((unsigned)value < ARRAY_SIZE(func_table)) {
+ 		unsigned long flags;
+ 
+ 		spin_lock_irqsave(&func_buf_lock, flags);
+ 		if (func_table[value])
+ 			puts_queue(vc, func_table[value]);
+ 		spin_unlock_irqrestore(&func_buf_lock, flags);
+ 
+ 	} else
+ 		pr_err("k_fn called with value=%d\n", value);
+ }
+ 
+ static void k_cur(struct vc_data *vc, unsigned char value, char up_flag)
+ {
+ 	static const char cur_chars[] = "BDCA";
+ 
+ 	if (up_flag)
+ 		return;
+ 
+ 	applkey(vc, cur_chars[value], vc_kbd_mode(kbd, VC_CKMODE));
+ }
+ 
+ static void k_pad(struct vc_data *vc, unsigned char value, char up_flag)
+ {
+ 	static const char pad_chars[] = "0123456789+-*/\015,.?()#";
+ 	static const char app_map[] = "pqrstuvwxylSRQMnnmPQS";
+ 
+ 	if (up_flag)
+ 		return;		/* no action, if this is a key release */
+ 
+ 	/* kludge... shift forces cursor/number keys */
+ 	if (vc_kbd_mode(kbd, VC_APPLIC) && !shift_down[KG_SHIFT]) {
+ 		applkey(vc, app_map[value], 1);
+ 		return;
+ 	}
+ 
+ 	if (!vc_kbd_led(kbd, VC_NUMLOCK)) {
+ 
+ 		switch (value) {
+ 		case KVAL(K_PCOMMA):
+ 		case KVAL(K_PDOT):
+ 			k_fn(vc, KVAL(K_REMOVE), 0);
+ 			return;
+ 		case KVAL(K_P0):
+ 			k_fn(vc, KVAL(K_INSERT), 0);
+ 			return;
+ 		case KVAL(K_P1):
+ 			k_fn(vc, KVAL(K_SELECT), 0);
+ 			return;
+ 		case KVAL(K_P2):
+ 			k_cur(vc, KVAL(K_DOWN), 0);
+ 			return;
+ 		case KVAL(K_P3):
+ 			k_fn(vc, KVAL(K_PGDN), 0);
+ 			return;
+ 		case KVAL(K_P4):
+ 			k_cur(vc, KVAL(K_LEFT), 0);
+ 			return;
+ 		case KVAL(K_P6):
+ 			k_cur(vc, KVAL(K_RIGHT), 0);
+ 			return;
+ 		case KVAL(K_P7):
+ 			k_fn(vc, KVAL(K_FIND), 0);
+ 			return;
+ 		case KVAL(K_P8):
+ 			k_cur(vc, KVAL(K_UP), 0);
+ 			return;
+ 		case KVAL(K_P9):
+ 			k_fn(vc, KVAL(K_PGUP), 0);
+ 			return;
+ 		case KVAL(K_P5):
+ 			applkey(vc, 'G', vc_kbd_mode(kbd, VC_APPLIC));
+ 			return;
+ 		}
+ 	}
+ 
+ 	put_queue(vc, pad_chars[value]);
+ 	if (value == KVAL(K_PENTER) && vc_kbd_mode(kbd, VC_CRLF))
+ 		put_queue(vc, 10);
+ }
+ 
+ static void k_shift(struct vc_data *vc, unsigned char value, char up_flag)
+ {
+ 	int old_state = shift_state;
+ 
+ 	if (rep)
+ 		return;
+ 	/*
+ 	 * Mimic typewriter:
+ 	 * a CapsShift key acts like Shift but undoes CapsLock
+ 	 */
+ 	if (value == KVAL(K_CAPSSHIFT)) {
+ 		value = KVAL(K_SHIFT);
+ 		if (!up_flag)
+ 			clr_vc_kbd_led(kbd, VC_CAPSLOCK);
+ 	}
+ 
+ 	if (up_flag) {
+ 		/*
+ 		 * shift_down[] counts held keys, so releasing one of two
+ 		 * shift or control keys keeps the modifier bit set below
+ 		 */
+ 		if (shift_down[value])
+ 			shift_down[value]--;
+ 	} else
+ 		shift_down[value]++;
+ 
+ 	if (shift_down[value])
+ 		shift_state |= (1 << value);
+ 	else
+ 		shift_state &= ~(1 << value);
+ 
+ 	/* kludge: a modifier release flushes the code built up by k_ascii() */
+ 	if (up_flag && shift_state != old_state && npadch_active) {
+ 		if (kbd->kbdmode == VC_UNICODE)
+ 			to_utf8(vc, npadch_value);
+ 		else
+ 			put_queue(vc, npadch_value & 0xff);
+ 		npadch_active = false;
+ 	}
+ }
+ 
+ static void k_meta(struct vc_data *vc, unsigned char value, char up_flag)
+ {
+ 	if (up_flag)
+ 		return;
+ 
+ 	if (vc_kbd_mode(kbd, VC_META)) {
+ 		put_queue(vc, '\033');
+ 		put_queue(vc, value);
+ 	} else
+ 		put_queue(vc, value | 0x80);
+ }
+ 
+ static void k_ascii(struct vc_data *vc, unsigned char value, char up_flag)
+ {
+ 	unsigned int base;
+ 
+ 	if (up_flag)
+ 		return;
+ 
+ 	if (value < 10) {
+ 		/* decimal input of code, while Alt depressed */
+ 		base = 10;
+ 	} else {
+ 		/* hexadecimal input of code, while AltGr depressed */
+ 		value -= 10;
+ 		base = 16;
+ 	}
+ 
+ 	if (!npadch_active) {
+ 		npadch_value = 0;
+ 		npadch_active = true;
+ 	}
+ 
+ 	npadch_value = npadch_value * base + value;
+ }
+ 
+ static void k_lock(struct vc_data *vc, unsigned char value, char up_flag)
+ {
+ 	if (up_flag || rep)
+ 		return;
+ 
+ 	chg_vc_kbd_lock(kbd, value);
+ }
+ 
+ static void k_slock(struct vc_data *vc, unsigned char value, char up_flag)
+ {
+ 	k_shift(vc, value, up_flag);
+ 	if (up_flag || rep)
+ 		return;
+ 
+ 	chg_vc_kbd_slock(kbd, value);
+ 	/* try to make Alt, oops, AltGr and such work */
+ 	if (!key_maps[kbd->lockstate ^ kbd->slockstate]) {
+ 		kbd->slockstate = 0;
+ 		chg_vc_kbd_slock(kbd, value);
+ 	}
+ }
+ 
+ /* by default, 300ms interval for combination release */
+ static unsigned brl_timeout = 300;
+ MODULE_PARM_DESC(brl_timeout, "Braille keys release delay in ms (0 for commit on first key release)");
+ module_param(brl_timeout, uint, 0644);
+ 
+ static unsigned brl_nbchords = 1;
+ MODULE_PARM_DESC(brl_nbchords, "Number of chords that produce a braille pattern (0 for dead chords)");
+ module_param(brl_nbchords, uint, 0644);
+ 
+ static void k_brlcommit(struct vc_data *vc, unsigned int pattern, char up_flag)
+ {
+ 	static unsigned long chords;
+ 	static unsigned committed;
+ 
+ 	if (!brl_nbchords)
+ 		k_deadunicode(vc, BRL_UC_ROW | pattern, up_flag);
+ 	else {
+ 		committed |= pattern;
+ 		chords++;
+ 		if (chords == brl_nbchords) {
+ 			k_unicode(vc, BRL_UC_ROW | committed, up_flag);
+ 			chords = 0;
+ 			committed = 0;
+ 		}
+ 	}
+ }
+ 
+ static void k_brl(struct vc_data *vc, unsigned char value, char up_flag)
+ {
+ 	static unsigned pressed, committing;
+ 	static unsigned long releasestart;
+ 
+ 	if (kbd->kbdmode != VC_UNICODE) {
+ 		if (!up_flag)
+ 			pr_warn("keyboard mode must be unicode for braille patterns\n");
+ 		return;
+ 	}
+ 
+ 	if (!value) {
+ 		k_unicode(vc, BRL_UC_ROW, up_flag);
+ 		return;
+ 	}
+ 
+ 	if (value > 8)
+ 		return;
+ 
+ 	if (!up_flag) {
+ 		pressed |= 1 << (value - 1);
+ 		if (!brl_timeout)
+ 			committing = pressed;
+ 	} else if (brl_timeout) {
+ 		if (!committing ||
+ 		    time_after(jiffies,
+ 			       releasestart + msecs_to_jiffies(brl_timeout))) {
+ 			committing = pressed;
+ 			releasestart = jiffies;
+ 		}
+ 		pressed &= ~(1 << (value - 1));
+ 		if (!pressed && committing) {
+ 			k_brlcommit(vc, committing, 0);
+ 			committing = 0;
+ 		}
+ 	} else {
+ 		if (committing) {
+ 			k_brlcommit(vc, committing, 0);
+ 			committing = 0;
+ 		}
+ 		pressed &= ~(1 << (value - 1));
+ 	}
+ }
+ 
+ #if IS_ENABLED(CONFIG_INPUT_LEDS) && IS_ENABLED(CONFIG_LEDS_TRIGGERS)
+ 
+ struct kbd_led_trigger {
+ 	struct led_trigger trigger;
+ 	unsigned int mask;
+ };
+ 
+ static int kbd_led_trigger_activate(struct led_classdev *cdev)
+ {
+ 	struct kbd_led_trigger *trigger =
+ 		container_of(cdev->trigger, struct kbd_led_trigger, trigger);
+ 
+ 	tasklet_disable(&keyboard_tasklet);
+ 	if (ledstate != -1U)
+ 		led_trigger_event(&trigger->trigger,
+ 				  ledstate & trigger->mask ?
+ 					LED_FULL : LED_OFF);
+ 	tasklet_enable(&keyboard_tasklet);
+ 
+ 	return 0;
+ }
+ 
+ #define KBD_LED_TRIGGER(_led_bit, _name) {			\
+ 		.trigger = {					\
+ 			.name = _name,				\
+ 			.activate = kbd_led_trigger_activate,	\
+ 		},						\
+ 		.mask	= BIT(_led_bit),			\
+ 	}
+ 
+ #define KBD_LOCKSTATE_TRIGGER(_led_bit, _name)		\
+ 	KBD_LED_TRIGGER((_led_bit) + 8, _name)
+ 
+ static struct kbd_led_trigger kbd_led_triggers[] = {
+ 	KBD_LED_TRIGGER(VC_SCROLLOCK, "kbd-scrolllock"),
+ 	KBD_LED_TRIGGER(VC_NUMLOCK,   "kbd-numlock"),
+ 	KBD_LED_TRIGGER(VC_CAPSLOCK,  "kbd-capslock"),
+ 	KBD_LED_TRIGGER(VC_KANALOCK,  "kbd-kanalock"),
+ 
+ 	KBD_LOCKSTATE_TRIGGER(VC_SHIFTLOCK,  "kbd-shiftlock"),
+ 	KBD_LOCKSTATE_TRIGGER(VC_ALTGRLOCK,  "kbd-altgrlock"),
+ 	KBD_LOCKSTATE_TRIGGER(VC_CTRLLOCK,   "kbd-ctrllock"),
+ 	KBD_LOCKSTATE_TRIGGER(VC_ALTLOCK,    "kbd-altlock"),
+ 	KBD_LOCKSTATE_TRIGGER(VC_SHIFTLLOCK, "kbd-shiftllock"),
+ 	KBD_LOCKSTATE_TRIGGER(VC_SHIFTRLOCK, "kbd-shiftrlock"),
+ 	KBD_LOCKSTATE_TRIGGER(VC_CTRLLLOCK,  "kbd-ctrlllock"),
+ 	KBD_LOCKSTATE_TRIGGER(VC_CTRLRLOCK,  "kbd-ctrlrlock"),
+ };
+ 
+ static void kbd_propagate_led_state(unsigned int old_state,
+ 				    unsigned int new_state)
+ {
+ 	struct kbd_led_trigger *trigger;
+ 	unsigned int changed = old_state ^ new_state;
+ 	int i;
+ 
+ 	for (i = 0; i < ARRAY_SIZE(kbd_led_triggers); i++) {
+ 		trigger = &kbd_led_triggers[i];
+ 
+ 		if (changed & trigger->mask)
+ 			led_trigger_event(&trigger->trigger,
+ 					  new_state & trigger->mask ?
+ 						LED_FULL : LED_OFF);
+ 	}
+ }
+ 
+ static int kbd_update_leds_helper(struct input_handle *handle, void *data)
+ {
+ 	unsigned int led_state = *(unsigned int *)data;
+ 
+ 	if (test_bit(EV_LED, handle->dev->evbit))
+ 		kbd_propagate_led_state(~led_state, led_state);
+ 
+ 	return 0;
+ }
+ 
+ static void kbd_init_leds(void)
+ {
+ 	int error;
+ 	int i;
+ 
+ 	for (i = 0; i < ARRAY_SIZE(kbd_led_triggers); i++) {
+ 		error = led_trigger_register(&kbd_led_triggers[i].trigger);
+ 		if (error)
+ 			pr_err("error %d while registering trigger %s\n",
+ 			       error, kbd_led_triggers[i].trigger.name);
+ 	}
+ }
+ 
+ #else
+ 
+ static int kbd_update_leds_helper(struct input_handle *handle, void *data)
+ {
+ 	unsigned int leds = *(unsigned int *)data;
+ 
+ 	if (test_bit(EV_LED, handle->dev->evbit)) {
+ 		input_inject_event(handle, EV_LED, LED_SCROLLL, !!(leds & 0x01));
+ 		input_inject_event(handle, EV_LED, LED_NUML,    !!(leds & 0x02));
+ 		input_inject_event(handle, EV_LED, LED_CAPSL,   !!(leds & 0x04));
+ 		input_inject_event(handle, EV_SYN, SYN_REPORT, 0);
+ 	}
+ 
+ 	return 0;
+ }
+ 
+ static void kbd_propagate_led_state(unsigned int old_state,
+ 				    unsigned int new_state)
+ {
+ 	input_handler_for_each_handle(&kbd_handler, &new_state,
+ 				      kbd_update_leds_helper);
+ }
+ 
+ static void kbd_init_leds(void)
+ {
+ }
+ 
+ #endif
+ 
+ /*
+  * The leds display either (i) the status of NumLock, CapsLock, ScrollLock,
+  * or (ii) whatever pattern of lights people want to show using KDSETLED,
+  * or (iii) specified bits of specified words in kernel memory.
+  */
+ static unsigned char getledstate(void)
+ {
+ 	return ledstate & 0xff;
+ }
+ 
+ void setledstate(struct kbd_struct *kb, unsigned int led)
+ {
+ 	unsigned long flags;
+ 	spin_lock_irqsave(&led_lock, flags);
+ 	if (!(led & ~7)) {	/* only the low three bits set: show them */
+ 		ledioctl = led;
+ 		kb->ledmode = LED_SHOW_IOCTL;
+ 	} else			/* any other bit: show kb's LED flags */
+ 		kb->ledmode = LED_SHOW_FLAGS;
+ 
+ 	set_leds();
+ 	spin_unlock_irqrestore(&led_lock, flags);
+ }
+ 
+ static inline unsigned char getleds(void)
+ {
+ 	struct kbd_struct *kb = kbd_table + fg_console;
+ 
+ 	if (kb->ledmode == LED_SHOW_IOCTL)
+ 		return ledioctl;
+ 
+ 	return kb->ledflagstate;
+ }
+ 
+ /**
+  *	vt_get_leds	-	helper for braille console
+  *	@console: console to read
+  *	@flag: flag we want to check
+  *
+  *	Check the status of a keyboard led flag and report it back
+  */
+ int vt_get_leds(int console, int flag)
+ {
+ 	struct kbd_struct *kb = kbd_table + console;
+ 	int ret;
+ 	unsigned long flags;
+ 
+ 	spin_lock_irqsave(&led_lock, flags);
+ 	ret = vc_kbd_led(kb, flag);
+ 	spin_unlock_irqrestore(&led_lock, flags);
+ 
+ 	return ret;
+ }
+ EXPORT_SYMBOL_GPL(vt_get_leds);
+ 
+ /**
+  *	vt_set_led_state	-	set LED state of a console
+  *	@console: console to set
+  *	@leds: LED bits
+  *
+  *	Set the LEDs on a console. This is a wrapper for the VT layer
+  *	so that we can keep kbd knowledge internal
+  */
+ void vt_set_led_state(int console, int leds)
+ {
+ 	struct kbd_struct *kb = kbd_table + console;
+ 	setledstate(kb, leds);
+ }
+ 
+ /**
+  *	vt_kbd_con_start	-	Keyboard side of console start
+  *	@console: console
+  *
+  *	Handle console start. This is a wrapper for the VT layer
+  *	so that we can keep kbd knowledge internal
+  *
+  *	FIXME: We eventually need to hold the kbd lock here to protect
+  *	the LED updating. We can't do it yet because fn_hold calls stop_tty
+  *	and start_tty under the kbd_event_lock, while normal tty paths
+  *	don't hold the lock. We probably need to split out an LED lock
+  *	but not during an -rc release!
+  */
+ void vt_kbd_con_start(int console)
+ {
+ 	struct kbd_struct *kb = kbd_table + console;
+ 	unsigned long flags;
+ 	spin_lock_irqsave(&led_lock, flags);
+ 	clr_vc_kbd_led(kb, VC_SCROLLOCK);
+ 	set_leds();
+ 	spin_unlock_irqrestore(&led_lock, flags);
+ }
+ 
+ /**
+  *	vt_kbd_con_stop		-	Keyboard side of console stop
+  *	@console: console
+  *
+  *	Handle console stop. This is a wrapper for the VT layer
+  *	so that we can keep kbd knowledge internal
+  */
+ void vt_kbd_con_stop(int console)
+ {
+ 	struct kbd_struct *kb = kbd_table + console;
+ 	unsigned long flags;
+ 	spin_lock_irqsave(&led_lock, flags);
+ 	set_vc_kbd_led(kb, VC_SCROLLOCK);
+ 	set_leds();
+ 	spin_unlock_irqrestore(&led_lock, flags);
+ }
+ 
+ /*
+  * This is the tasklet that updates LED state of LEDs using standard
+  * keyboard triggers. The reason we use tasklet is that we need to
+  * handle the scenario when the keyboard handler is not registered yet
+  * but we are already getting updates from the VT to update LED state.
+  */
+ static void kbd_bh(unsigned long dummy)
+ {
+ 	unsigned int leds;
+ 	unsigned long flags;
+ 
+ 	spin_lock_irqsave(&led_lock, flags);
+ 	leds = getleds();
+ 	leds |= (unsigned int)kbd->lockstate << 8;
+ 	spin_unlock_irqrestore(&led_lock, flags);
+ 
+ 	if (leds != ledstate) {
+ 		kbd_propagate_led_state(ledstate, leds);
+ 		ledstate = leds;
+ 	}
+ }
+ 
+ DECLARE_TASKLET_DISABLED(keyboard_tasklet, kbd_bh, 0);
+ 
+ #if defined(CONFIG_X86) || defined(CONFIG_IA64) || defined(CONFIG_ALPHA) ||\
+     defined(CONFIG_MIPS) || defined(CONFIG_PPC) || defined(CONFIG_SPARC) ||\
+     defined(CONFIG_PARISC) || defined(CONFIG_SUPERH) ||\
+     (defined(CONFIG_ARM) && defined(CONFIG_KEYBOARD_ATKBD) && !defined(CONFIG_ARCH_RPC))
+ 
+ #define HW_RAW(dev) (test_bit(EV_MSC, dev->evbit) && test_bit(MSC_RAW, dev->mscbit) &&\
+ 			((dev)->id.bustype == BUS_I8042) && ((dev)->id.vendor == 0x0001) && ((dev)->id.product == 0x0001))
+ 
+ static const unsigned short x86_keycodes[256] =
+ 	{ 0,  1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11, 12, 13, 14, 15,
+ 	 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31,
+ 	 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47,
+ 	 48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63,
+ 	 64, 65, 66, 67, 68, 69, 70, 71, 72, 73, 74, 75, 76, 77, 78, 79,
+ 	 80, 81, 82, 83, 84,118, 86, 87, 88,115,120,119,121,112,123, 92,
+ 	284,285,309,  0,312, 91,327,328,329,331,333,335,336,337,338,339,
+ 	367,288,302,304,350, 89,334,326,267,126,268,269,125,347,348,349,
+ 	360,261,262,263,268,376,100,101,321,316,373,286,289,102,351,355,
+ 	103,104,105,275,287,279,258,106,274,107,294,364,358,363,362,361,
+ 	291,108,381,281,290,272,292,305,280, 99,112,257,306,359,113,114,
+ 	264,117,271,374,379,265,266, 93, 94, 95, 85,259,375,260, 90,116,
+ 	377,109,111,277,278,282,283,295,296,297,299,300,301,293,303,307,
+ 	308,310,313,314,315,317,318,319,320,357,322,323,324,325,276,330,
+ 	332,340,365,342,343,344,345,346,356,270,341,368,369,370,371,372 };
+ 
+ #ifdef CONFIG_SPARC
+ static int sparc_l1_a_state;
+ extern void sun_do_break(void);
+ #endif
+ 
+ static int emulate_raw(struct vc_data *vc, unsigned int keycode,
+ 		       unsigned char up_flag)
+ {
+ 	int code;
+ 
+ 	switch (keycode) {
+ 
+ 	case KEY_PAUSE:
+ 		put_queue(vc, 0xe1);
+ 		put_queue(vc, 0x1d | up_flag);
+ 		put_queue(vc, 0x45 | up_flag);
+ 		break;
+ 
+ 	case KEY_HANGEUL:
+ 		if (!up_flag)
+ 			put_queue(vc, 0xf2);
+ 		break;
+ 
+ 	case KEY_HANJA:
+ 		if (!up_flag)
+ 			put_queue(vc, 0xf1);
+ 		break;
+ 
+ 	case KEY_SYSRQ:
+ 		/*
+ 		 * Real AT keyboards (that's what we're trying
+ 		 * to emulate here) emit 0xe0 0x2a 0xe0 0x37 when
+ 		 * pressing PrtSc/SysRq alone, but simply 0x54
+ 		 * when pressing Alt+PrtSc/SysRq.
+ 		 */
+ 		if (test_bit(KEY_LEFTALT, key_down) ||
+ 		    test_bit(KEY_RIGHTALT, key_down)) {
+ 			put_queue(vc, 0x54 | up_flag);
+ 		} else {
+ 			put_queue(vc, 0xe0);
+ 			put_queue(vc, 0x2a | up_flag);
+ 			put_queue(vc, 0xe0);
+ 			put_queue(vc, 0x37 | up_flag);
+ 		}
+ 		break;
+ 
+ 	default:
+ 		if (keycode > 255)
+ 			return -1;
+ 
+ 		code = x86_keycodes[keycode];
+ 		if (!code)
+ 			return -1;
+ 
+ 		if (code & 0x100)
+ 			put_queue(vc, 0xe0);
+ 		put_queue(vc, (code & 0x7f) | up_flag);
+ 
+ 		break;
+ 	}
+ 
+ 	return 0;
+ }
+ 
+ #else
+ 
+ #define HW_RAW(dev)	0
+ 
+ static int emulate_raw(struct vc_data *vc, unsigned int keycode, unsigned char up_flag)
+ {
+ 	if (keycode > 127)
+ 		return -1;
+ 
+ 	put_queue(vc, keycode | up_flag);
+ 	return 0;
+ }
+ #endif
+ 
+ static void kbd_rawcode(unsigned char data)
+ {
+ 	struct vc_data *vc = vc_cons[fg_console].d;
+ 
+ 	kbd = kbd_table + vc->vc_num;
+ 	if (kbd->kbdmode == VC_RAW)
+ 		put_queue(vc, data);
+ }
+ 
+ static void kbd_keycode(unsigned int keycode, int down, int hw_raw)
+ {
+ 	struct vc_data *vc = vc_cons[fg_console].d;
+ 	unsigned short keysym, *key_map;
+ 	unsigned char type;
+ 	bool raw_mode;
+ 	struct tty_struct *tty;
+ 	int shift_final;
+ 	struct keyboard_notifier_param param = { .vc = vc, .value = keycode, .down = down };
+ 	int rc;
+ 
+ 	tty = vc->port.tty;
+ 
+ 	if (tty && (!tty->driver_data)) {
+ 		/* No driver data? Strange. Okay we fix it then. */
+ 		tty->driver_data = vc;
+ 	}
+ 
+ 	kbd = kbd_table + vc->vc_num;
+ 
+ #ifdef CONFIG_SPARC
+ 	if (keycode == KEY_STOP)
+ 		sparc_l1_a_state = down;
+ #endif
+ 
+ 	rep = (down == 2);
+ 
+ 	raw_mode = (kbd->kbdmode == VC_RAW);
+ 	if (raw_mode && !hw_raw)
+ 		if (emulate_raw(vc, keycode, !down << 7))
+ 			if (keycode < BTN_MISC && printk_ratelimit())
+ 				pr_warn("can't emulate rawmode for keycode %d\n",
+ 					keycode);
+ 
+ #ifdef CONFIG_SPARC
+ 	if (keycode == KEY_A && sparc_l1_a_state) {
+ 		sparc_l1_a_state = false;
+ 		sun_do_break();
+ 	}
+ #endif
+ 
+ 	if (kbd->kbdmode == VC_MEDIUMRAW) {
+ 		/*
+ 		 * This is extended medium raw mode, with keys above 127
+ 		 * encoded as 0, high 7 bits, low 7 bits, with the 0 bearing
+ 		 * the 'up' flag if needed. 0 is reserved, so this shouldn't
+ 		 * interfere with anything else. The two bytes after 0 will
+ 		 * always have the up flag set not to interfere with older
+ 		 * applications. This allows for 16384 different keycodes,
+ 		 * which should be enough.
+ 		 */
+ 		if (keycode < 128) {
+ 			put_queue(vc, keycode | (!down << 7));
+ 		} else {
+ 			put_queue(vc, !down << 7);
+ 			put_queue(vc, (keycode >> 7) | 0x80);
+ 			put_queue(vc, keycode | 0x80);
+ 		}
+ 		raw_mode = true;
+ 	}
+ 
+ 	if (down)
+ 		set_bit(keycode, key_down);
+ 	else
+ 		clear_bit(keycode, key_down);
+ 
+ 	if (rep &&
+ 	    (!vc_kbd_mode(kbd, VC_REPEAT) ||
+ 	     (tty && !L_ECHO(tty) && tty_chars_in_buffer(tty)))) {
+ 		/*
+ 		 * Don't repeat a key if the input buffers are not empty and the
+ 		 * characters aren't echoed locally. This makes key repeat
+ 		 * usable with slow applications and under heavy loads.
+ 		 */
+ 		return;
+ 	}
+ 
+ 	param.shift = shift_final = (shift_state | kbd->slockstate) ^ kbd->lockstate;
+ 	param.ledstate = kbd->ledflagstate;
+ 	key_map = key_maps[shift_final];
+ 
+ 	rc = atomic_notifier_call_chain(&keyboard_notifier_list,
+ 					KBD_KEYCODE, &param);
+ 	if (rc == NOTIFY_STOP || !key_map) {
+ 		atomic_notifier_call_chain(&keyboard_notifier_list,
+ 					   KBD_UNBOUND_KEYCODE, &param);
+ 		do_compute_shiftstate();
+ 		kbd->slockstate = 0;
+ 		return;
+ 	}
+ 
+ 	if (keycode < NR_KEYS)
+ 		keysym = key_map[keycode];
+ 	else if (keycode >= KEY_BRL_DOT1 && keycode <= KEY_BRL_DOT8)
+ 		keysym = U(K(KT_BRL, keycode - KEY_BRL_DOT1 + 1));
+ 	else
+ 		return;
+ 
+ 	type = KTYP(keysym);
+ 
+ 	if (type < 0xf0) {
+ 		param.value = keysym;
+ 		rc = atomic_notifier_call_chain(&keyboard_notifier_list,
+ 						KBD_UNICODE, &param);
+ 		if (rc != NOTIFY_STOP)
+ 			if (down && !raw_mode)
+ 				k_unicode(vc, keysym, !down);
+ 		return;
+ 	}
+ 
+ 	type -= 0xf0;
+ 
+ 	if (type == KT_LETTER) {
+ 		type = KT_LATIN;
+ 		if (vc_kbd_led(kbd, VC_CAPSLOCK)) {
+ 			key_map = key_maps[shift_final ^ (1 << KG_SHIFT)];
+ 			if (key_map)
+ 				keysym = key_map[keycode];
+ 		}
+ 	}
+ 
+ 	param.value = keysym;
+ 	rc = atomic_notifier_call_chain(&keyboard_notifier_list,
+ 					KBD_KEYSYM, &param);
+ 	if (rc == NOTIFY_STOP)
+ 		return;
+ 
+ 	if ((raw_mode || kbd->kbdmode == VC_OFF) && type != KT_SPEC && type != KT_SHIFT)
+ 		return;
+ 
+ 	(*k_handler[type])(vc, keysym & 0xff, !down);
+ 
+ 	param.ledstate = kbd->ledflagstate;
+ 	atomic_notifier_call_chain(&keyboard_notifier_list, KBD_POST_KEYSYM, &param);
+ 
+ 	if (type != KT_SLOCK)
+ 		kbd->slockstate = 0;
+ }
+ 
+ static void kbd_event(struct input_handle *handle, unsigned int event_type,
+ 		      unsigned int event_code, int value)
+ {
+ 	/* We are called with interrupts disabled, so a plain spin_lock() will do */
+ 	spin_lock(&kbd_event_lock);
+ 
+ 	if (event_type == EV_MSC && event_code == MSC_RAW && HW_RAW(handle->dev))
+ 		kbd_rawcode(value);
+ 	if (event_type == EV_KEY && event_code <= KEY_MAX)
+ 		kbd_keycode(event_code, value, HW_RAW(handle->dev));
+ 
+ 	spin_unlock(&kbd_event_lock);
+ 	/* Done for every event, whatever its type, once the lock is dropped */
+ 	tasklet_schedule(&keyboard_tasklet);
+ 	do_poke_blanked_console = 1;
+ 	schedule_console_callback();
+ }
+ 
+ static bool kbd_match(struct input_handler *handler, struct input_dev *dev)
+ {
+ 	int code;
+ 
+ 	/* Any device advertising EV_SND is accepted outright. */
+ 	if (test_bit(EV_SND, dev->evbit))
+ 		return true;
+ 	/* Otherwise it must report EV_KEY and at least one key in our ranges. */
+ 	if (!test_bit(EV_KEY, dev->evbit))
+ 		return false;
+ 	for (code = KEY_RESERVED; code < BTN_MISC; code++)
+ 		if (test_bit(code, dev->keybit))
+ 			return true;
+ 	for (code = KEY_BRL_DOT1; code <= KEY_BRL_DOT10; code++)
+ 		if (test_bit(code, dev->keybit))
+ 			return true;
+ 	return false;
+ }
+ 
+ /*
+  * Called when a keyboard (or other input device) is found. A handle is
+  * allocated for the device, registered, and the device is opened so that
+  * we receive its events. Returns 0 on success; on failure everything set
+  * up so far is undone and the failing step's error code is returned.
+  */
+ static int kbd_connect(struct input_handler *handler, struct input_dev *dev,
+ 			const struct input_device_id *id)
+ {
+ 	struct input_handle *h = kzalloc(sizeof(*h), GFP_KERNEL);
+ 	int rc;
+ 
+ 	if (!h)
+ 		return -ENOMEM;
+ 
+ 	h->dev = dev;
+ 	h->handler = handler;
+ 	h->name = "kbd";
+ 
+ 	rc = input_register_handle(h);
+ 	if (rc)
+ 		goto out_free;
+ 
+ 	rc = input_open_device(h);
+ 	if (rc)
+ 		goto out_unregister;
+ 
+ 	return 0;
+ 
+ 	/* Error unwinding, in reverse order of setup */
+  out_unregister:
+ 	input_unregister_handle(h);
+  out_free:
+ 	kfree(h);
+ 	return rc;
+ }
+ 
+ static void kbd_disconnect(struct input_handle *handle)
+ {
+ 	input_close_device(handle);		/* undoes input_open_device() */
+ 	input_unregister_handle(handle);	/* undoes input_register_handle() */
+ 	kfree(handle);				/* allocated in kbd_connect() */
+ }
+ 
+ /*
+  * Start keyboard handler on the new keyboard by refreshing LED state to
+  * match the rest of the system. The keyboard tasklet is disabled meanwhile.
+  */
+ static void kbd_start(struct input_handle *handle)
+ {
+ 	tasklet_disable(&keyboard_tasklet);
+ 
+ 	if (ledstate != -1U)	/* presumably -1U means "no LED state yet" - confirm */
+ 		kbd_update_leds_helper(handle, &ledstate);
+ 
+ 	tasklet_enable(&keyboard_tasklet);
+ }
+ 
+ static const struct input_device_id kbd_ids[] = {
+ 	{	/* matched on event type only: EV_KEY */
+ 		.flags = INPUT_DEVICE_ID_MATCH_EVBIT,
+ 		.evbit = { BIT_MASK(EV_KEY) },
+ 	},
+ 
+ 	{	/* matched on event type only: EV_SND */
+ 		.flags = INPUT_DEVICE_ID_MATCH_EVBIT,
+ 		.evbit = { BIT_MASK(EV_SND) },
+ 	},
+ 
+ 	{ },    /* Terminating entry */
+ };
+ 
+ MODULE_DEVICE_TABLE(input, kbd_ids);
+ 
+ static struct input_handler kbd_handler = {	/* registered by kbd_init() */
+ 	.event		= kbd_event,
+ 	.match		= kbd_match,
+ 	.connect	= kbd_connect,
+ 	.disconnect	= kbd_disconnect,
+ 	.start		= kbd_start,
+ 	.name		= "kbd",
+ 	.id_table	= kbd_ids,
+ };
+ 
+ int __init kbd_init(void)
+ {
+ 	struct kbd_struct *kb;
+ 	int error;
+ 
+ 	/* Give the keyboard state of every console its initial values. */
+ 	for (kb = kbd_table; kb < kbd_table + MAX_NR_CONSOLES; kb++) {
+ 		kb->ledflagstate = kbd_defleds();
+ 		kb->default_ledflagstate = kbd_defleds();
+ 		kb->ledmode = LED_SHOW_FLAGS;
+ 		kb->lockstate = KBD_DEFLOCK;
+ 		kb->slockstate = 0;
+ 		kb->modeflags = KBD_DEFMODE;
+ 		kb->kbdmode = default_utf8 ? VC_UNICODE : VC_XLATE;
+ 	}
+ 
+ 	kbd_init_leds();
+ 	error = input_register_handler(&kbd_handler);
+ 	if (error)
+ 		return error;
+ 
+ 	tasklet_enable(&keyboard_tasklet);
+ 	tasklet_schedule(&keyboard_tasklet);
+ 
+ 	return 0;
+ }
+ 
+ /* Ioctl support code */
+ 
+ /**
+  *	vt_do_diacrit		-	diacritical table updates
+  *	@cmd: ioctl request
+  *	@udp: pointer to user data for ioctl
+  *	@perm: permissions check computed by caller
+  *
+  *	Read or update the diacritical (accent) table. Every access to the
+  *	table is made under kbd_event_lock. Returns 0 or a negative errno.
+  */
+ int vt_do_diacrit(unsigned int cmd, void __user *udp, int perm)
+ {
+ 	unsigned long flags;
+ 	int asize;
+ 	int ret = 0;
+ 
+ 	switch (cmd) {
+ 	case KDGKBDIACR:
+ 	{
+ 		struct kbdiacrs __user *a = udp;
+ 		struct kbdiacr *dia;
+ 		int i;
+ 
+ 		dia = kmalloc_array(MAX_DIACR, sizeof(struct kbdiacr),
+ 								GFP_KERNEL);
+ 		if (!dia)
+ 			return -ENOMEM;
+ 
+ 		/* Lock the diacriticals table, make an 8-bit copy and then
+ 		   copy that to user space after we unlock */
+ 		spin_lock_irqsave(&kbd_event_lock, flags);
+ 
+ 		asize = accent_table_size;
+ 		for (i = 0; i < asize; i++) {
+ 			dia[i].diacr = conv_uni_to_8bit(
+ 						accent_table[i].diacr);
+ 			dia[i].base = conv_uni_to_8bit(
+ 						accent_table[i].base);
+ 			dia[i].result = conv_uni_to_8bit(
+ 						accent_table[i].result);
+ 		}
+ 		spin_unlock_irqrestore(&kbd_event_lock, flags);
+ 
+ 		if (put_user(asize, &a->kb_cnt))
+ 			ret = -EFAULT;
+ 		else  if (copy_to_user(a->kbdiacr, dia,
+ 				asize * sizeof(struct kbdiacr)))
+ 			ret = -EFAULT;
+ 		kfree(dia);
+ 		return ret;
+ 	}
+ 	case KDGKBDIACRUC:
+ 	{
+ 		struct kbdiacrsuc __user *a = udp;
+ 		void *buf;
+ 
+ 		buf = kmalloc_array(MAX_DIACR, sizeof(struct kbdiacruc),
+ 								GFP_KERNEL);
+ 		if (buf == NULL)
+ 			return -ENOMEM;
+ 
+ 		/* Lock the diacriticals table, make a copy and then
+ 		   copy that to user space after we unlock */
+ 		spin_lock_irqsave(&kbd_event_lock, flags);
+ 
+ 		asize = accent_table_size;
+ 		memcpy(buf, accent_table, asize * sizeof(struct kbdiacruc));
+ 
+ 		spin_unlock_irqrestore(&kbd_event_lock, flags);
+ 
+ 		if (put_user(asize, &a->kb_cnt))
+ 			ret = -EFAULT;
+ 		else if (copy_to_user(a->kbdiacruc, buf,
+ 				asize*sizeof(struct kbdiacruc)))
+ 			ret = -EFAULT;
+ 		kfree(buf);
+ 		return ret;
+ 	}
+ 
+ 	case KDSKBDIACR:
+ 	{
+ 		struct kbdiacrs __user *a = udp;
+ 		struct kbdiacr *dia = NULL;
+ 		unsigned int ct;
+ 		int i;
+ 
+ 		if (!perm)
+ 			return -EPERM;
+ 		if (get_user(ct, &a->kb_cnt))
+ 			return -EFAULT;
+ 		if (ct >= MAX_DIACR)
+ 			return -EINVAL;
+ 
+ 		if (ct) {
+ 			/* Fetch the user's entries before taking the spinlock */
+ 			dia = memdup_user(a->kbdiacr,
+ 					sizeof(struct kbdiacr) * ct);
+ 			if (IS_ERR(dia))
+ 				return PTR_ERR(dia);
+ 
+ 		}
+ 
+ 		spin_lock_irqsave(&kbd_event_lock, flags);
+ 		accent_table_size = ct;
+ 		for (i = 0; i < ct; i++) {
+ 			accent_table[i].diacr =
+ 					conv_8bit_to_uni(dia[i].diacr);
+ 			accent_table[i].base =
+ 					conv_8bit_to_uni(dia[i].base);
+ 			accent_table[i].result =
+ 					conv_8bit_to_uni(dia[i].result);
+ 		}
+ 		spin_unlock_irqrestore(&kbd_event_lock, flags);
+ 		kfree(dia);	/* dia is still NULL when ct == 0 */
+ 		return 0;
+ 	}
+ 
+ 	case KDSKBDIACRUC:
+ 	{
+ 		struct kbdiacrsuc __user *a = udp;
+ 		unsigned int ct;
+ 		void *buf = NULL;
+ 
+ 		if (!perm)
+ 			return -EPERM;
+ 
+ 		if (get_user(ct, &a->kb_cnt))
+ 			return -EFAULT;
+ 
+ 		if (ct >= MAX_DIACR)
+ 			return -EINVAL;
+ 
+ 		if (ct) {
+ 			buf = memdup_user(a->kbdiacruc,
+ 					  ct * sizeof(struct kbdiacruc));
+ 			if (IS_ERR(buf))
+ 				return PTR_ERR(buf);
+ 		}
+ 		spin_lock_irqsave(&kbd_event_lock, flags);
+ 		if (ct)
+ 			memcpy(accent_table, buf,
+ 					ct * sizeof(struct kbdiacruc));
+ 		accent_table_size = ct;
+ 		spin_unlock_irqrestore(&kbd_event_lock, flags);
+ 		kfree(buf);	/* buf is still NULL when ct == 0 */
+ 		return 0;
+ 	}
+ 	}
+ 	return ret;	/* unrecognised cmd: ret is still 0 */
+ }
+ 
+ /**
+  *	vt_do_kdskbmode		-	set keyboard mode ioctl
+  *	@console: the console to use
+  *	@arg: the requested mode
+  *
+  *	Switch the keyboard mode of a console with kbd_event_lock held.
+  *	Returns 0 on success or -EINVAL if the mode is not recognised.
+  */
+ int vt_do_kdskbmode(int console, unsigned int arg)
+ {
+ 	struct kbd_struct *kb = &kbd_table[console];
+ 	unsigned long irqflags;
+ 	int err = 0;
+ 
+ 	spin_lock_irqsave(&kbd_event_lock, irqflags);
+ 	switch (arg) {
+ 	case K_RAW:
+ 		kb->kbdmode = VC_RAW;
+ 		break;
+ 	case K_MEDIUMRAW:
+ 		kb->kbdmode = VC_MEDIUMRAW;
+ 		break;
+ 	case K_OFF:
+ 		kb->kbdmode = VC_OFF;
+ 		break;
+ 	case K_XLATE:
+ 	case K_UNICODE:
+ 		/* These two modes additionally recompute the shift state, */
+ 		/* after the new mode has been stored. */
+ 		kb->kbdmode = (arg == K_XLATE) ? VC_XLATE : VC_UNICODE;
+ 		do_compute_shiftstate();
+ 		break;
+ 	default:
+ 		err = -EINVAL;
+ 		break;
+ 	}
+ 	spin_unlock_irqrestore(&kbd_event_lock, irqflags);
+ 	return err;
+ }
+ 
+ /**
+  *	vt_do_kdskbmeta		-	set keyboard meta state
+  *	@console: the console to use
+  *	@arg: the requested meta state
+  *
+  *	Set or clear the VC_META mode bit of a console with kbd_event_lock
+  *	held. Returns 0 on success or -EINVAL for an unknown meta state.
+  */
+ int vt_do_kdskbmeta(int console, unsigned int arg)
+ {
+ 	struct kbd_struct *kb = &kbd_table[console];
+ 	unsigned long irqflags;
+ 	int err = 0;
+ 
+ 	spin_lock_irqsave(&kbd_event_lock, irqflags);
+ 	if (arg == K_METABIT) {
+ 		/* K_METABIT: VC_META is cleared */
+ 		clr_vc_kbd_mode(kb, VC_META);
+ 	} else if (arg == K_ESCPREFIX) {
+ 		/* K_ESCPREFIX: VC_META is set */
+ 		set_vc_kbd_mode(kb, VC_META);
+ 	} else {
+ 		err = -EINVAL;
+ 	}
+ 	spin_unlock_irqrestore(&kbd_event_lock, irqflags);
+ 
+ 	return err;
+ }
+ 
+ int vt_do_kbkeycode_ioctl(int cmd, struct kbkeycode __user *user_kbkc,
+ 								int perm)
+ {
+ 	struct kbkeycode req;
+ 	int rc = 0;
+ 
+ 	/* The whole request is copied in before the command is looked at. */
+ 	if (copy_from_user(&req, user_kbkc, sizeof(req)))
+ 		return -EFAULT;
+ 
+ 	if (cmd == KDGETKEYCODE) {
+ 		rc = getkeycode(req.scancode);
+ 		if (rc >= 0)
+ 			rc = put_user(rc, &user_kbkc->keycode);
+ 	} else if (cmd == KDSETKEYCODE) {
+ 		if (!perm)
+ 			return -EPERM;
+ 		rc = setkeycode(req.scancode, req.keycode);
+ 	}
+ 	/* Any other command leaves rc at 0. */
+ 	return rc;
+ }
+ 
+ #define i (tmp.kb_index)	/* shorthands for fields of the local 'tmp'; */
+ #define s (tmp.kb_table)	/* all three are #undef'd again right after */
+ #define v (tmp.kb_value)	/* vt_do_kdsk_ioctl() */
+ 
+ int vt_do_kdsk_ioctl(int cmd, struct kbentry __user *user_kbe, int perm,
+ 						int console)
+ {
+ 	struct kbd_struct *kb = kbd_table + console;
+ 	struct kbentry tmp;
+ 	ushort *key_map, *new_map, val, ov;
+ 	unsigned long flags;
+ 
+ 	if (copy_from_user(&tmp, user_kbe, sizeof(struct kbentry)))
+ 		return -EFAULT;
+ 
+ 	if (!capable(CAP_SYS_TTY_CONFIG))
+ 		perm = 0;
+ 
+ 	switch (cmd) {
+ 	case KDGKBENT:
+ 		/* Ensure another thread doesn't free it under us */
+ 		spin_lock_irqsave(&kbd_event_lock, flags);
+ 		key_map = key_maps[s];
+ 		if (key_map) {
+ 		    val = U(key_map[i]);
+ 		    if (kb->kbdmode != VC_UNICODE && KTYP(val) >= NR_TYPES)
+ 			val = K_HOLE;
+ 		} else
+ 		    val = (i ? K_HOLE : K_NOSUCHMAP);
+ 		spin_unlock_irqrestore(&kbd_event_lock, flags);
+ 		return put_user(val, &user_kbe->kb_value);
+ 	case KDSKBENT:
+ 		if (!capable(CAP_SYS_TTY_CONFIG)) /* NOTE(review): repeats the check above */
+ 			perm = 0;
+ 		if (!perm)
+ 			return -EPERM;
+ 		if (!i && v == K_NOSUCHMAP) {
+ 			spin_lock_irqsave(&kbd_event_lock, flags);
+ 			/* deallocate map (table 0 is never removed) */
+ 			key_map = key_maps[s];
+ 			if (s && key_map) {
+ 			    key_maps[s] = NULL;
+ 			    if (key_map[0] == U(K_ALLOCATED)) {
+ 					kfree(key_map);
+ 					keymap_count--;
+ 			    }
+ 			}
+ 			spin_unlock_irqrestore(&kbd_event_lock, flags);
+ 			break;
+ 		}
+ 
+ 		if (KTYP(v) < NR_TYPES) {
+ 		    if (KVAL(v) > max_vals[KTYP(v)])
+ 				return -EINVAL;
+ 		} else
+ 		    if (kb->kbdmode != VC_UNICODE)
+ 				return -EINVAL;
+ 
+ 		/* ++Geert: non-PC keyboards may generate keycode zero */
+ #if !defined(__mc68000__) && !defined(__powerpc__)
+ 		/* assignment to entry 0 only tests validity of args */
+ 		if (!i)
+ 			break;
+ #endif
+ 
+ 		new_map = kmalloc(sizeof(plain_map), GFP_KERNEL);
+ 		if (!new_map)
+ 			return -ENOMEM;
+ 		spin_lock_irqsave(&kbd_event_lock, flags);
+ 		key_map = key_maps[s];
+ 		if (key_map == NULL) {
+ 			int j;
+ 
+ 			if (keymap_count >= MAX_NR_OF_USER_KEYMAPS &&
+ 			    !capable(CAP_SYS_RESOURCE)) {
+ 				spin_unlock_irqrestore(&kbd_event_lock, flags);
+ 				kfree(new_map);
+ 				return -EPERM;
+ 			}
+ 			key_maps[s] = new_map;
+ 			key_map = new_map;
+ 			key_map[0] = U(K_ALLOCATED);
+ 			for (j = 1; j < NR_KEYS; j++)
+ 				key_map[j] = U(K_HOLE);
+ 			keymap_count++;
+ 		} else
+ 			kfree(new_map);	/* a map already exists; drop the spare */
+ 
+ 		ov = U(key_map[i]);
+ 		if (v == ov)
+ 			goto out;
+ 		/*
+ 		 * Attention Key: adding or removing K_SAK needs CAP_SYS_ADMIN.
+ 		 */
+ 		if (((ov == K_SAK) || (v == K_SAK)) && !capable(CAP_SYS_ADMIN)) {
+ 			spin_unlock_irqrestore(&kbd_event_lock, flags);
+ 			return -EPERM;
+ 		}
+ 		key_map[i] = U(v);
+ 		if (!s && (KTYP(ov) == KT_SHIFT || KTYP(v) == KT_SHIFT))
+ 			do_compute_shiftstate();
+ out:
+ 		spin_unlock_irqrestore(&kbd_event_lock, flags);
+ 		break;
+ 	}
+ 	return 0;
+ }
+ #undef i
+ #undef s
+ #undef v
+ 
+ /* Get/set a function key string (KDGKBSENT/KDSKBSENT). FIXME: needs untangling */
+ int vt_do_kdgkb_ioctl(int cmd, struct kbsentry __user *user_kdgkb, int perm)
+ {
+ 	struct kbsentry *kbs;
+ 	u_char *q;
+ 	int sz, fnw_sz;
+ 	int delta;
+ 	char *first_free, *fj, *fnw;
+ 	int i, j, k;
+ 	int ret;
+ 	unsigned long flags;
+ 
+ 	if (!capable(CAP_SYS_TTY_CONFIG))
+ 		perm = 0;
+ 
+ 	kbs = kmalloc(sizeof(*kbs), GFP_KERNEL);
+ 	if (!kbs) {
+ 		ret = -ENOMEM;
+ 		goto reterr;
+ 	}
+ 
+ 	/* we mostly copy too much here (512bytes), but who cares ;) */
+ 	if (copy_from_user(kbs, user_kdgkb, sizeof(struct kbsentry))) {
+ 		ret = -EFAULT;
+ 		goto reterr;
+ 	}
+ 	kbs->kb_string[sizeof(kbs->kb_string)-1] = '\0';
+ 	i = kbs->kb_func;
+ 
+ 	switch (cmd) {
+ 	case KDGKBSENT: {
+ 		/* size should have been a struct member */
+ 		ssize_t len = sizeof(user_kdgkb->kb_string);
+ 
+ 		spin_lock_irqsave(&func_buf_lock, flags);
+ 		len = strlcpy(kbs->kb_string, func_table[i] ? : "", len);
+ 		spin_unlock_irqrestore(&func_buf_lock, flags);
+ 
+ 		ret = copy_to_user(user_kdgkb->kb_string, kbs->kb_string,
+ 				len + 1) ? -EFAULT : 0;
+ 
+ 		goto reterr;
+ 	}
+ 	case KDSKBSENT:
+ 		if (!perm) {
+ 			ret = -EPERM;
+ 			goto reterr;
+ 		}
+ 		fnw = NULL;
+ 		fnw_sz = 0;
+ 		/* race against other writers: re-evaluated after each unlock */
+ 		again:
+ 		spin_lock_irqsave(&func_buf_lock, flags);
+ 		q = func_table[i];
+ 
+ 		/* fj pointer to next entry after 'q' */
+ 		first_free = funcbufptr + (funcbufsize - funcbufleft);
+ 		for (j = i+1; j < MAX_NR_FUNC && !func_table[j]; j++)
+ 			;
+ 		if (j < MAX_NR_FUNC)
+ 			fj = func_table[j];
+ 		else
+ 			fj = first_free;
+ 		/* buffer usage increase by new entry */
+ 		delta = (q ? -strlen(q) : 1) + strlen(kbs->kb_string);
+ 
+ 		if (delta <= funcbufleft) { 	/* it fits in current buf */
+ 		    if (j < MAX_NR_FUNC) {
+ 			/* make enough space for new entry at 'fj' */
+ 			memmove(fj + delta, fj, first_free - fj);
+ 			for (k = j; k < MAX_NR_FUNC; k++)
+ 			    if (func_table[k])
+ 				func_table[k] += delta;
+ 		    }
+ 		    if (!q)
+ 		      func_table[i] = fj;
+ 		    funcbufleft -= delta;
+ 		} else {			/* allocate a larger buffer */
+ 		    sz = 256;
+ 		    while (sz < funcbufsize - funcbufleft + delta)
+ 		      sz <<= 1;
+ 		    if (fnw_sz != sz) {
+ 		      spin_unlock_irqrestore(&func_buf_lock, flags);
+ 		      kfree(fnw);
+ 		      fnw = kmalloc(sz, GFP_KERNEL);
+ 		      fnw_sz = sz;
+ 		      if (!fnw) {
+ 			ret = -ENOMEM;
+ 			goto reterr;
+ 		      }
+ 		      goto again;
+ 		    }
+ 		    if (!q)
+ 		      func_table[i] = fj;
+ 		    /* copy data before insertion point to new location */
+ 		    if (fj > funcbufptr)
+ 			memmove(fnw, funcbufptr, fj - funcbufptr);
+ 		    for (k = 0; k < j; k++)
+ 		      if (func_table[k])
+ 			func_table[k] = fnw + (func_table[k] - funcbufptr);
+ 
+ 		    /* copy data after insertion point to new location */
+ 		    if (first_free > fj) {
+ 			memmove(fnw + (fj - funcbufptr) + delta, fj, first_free - fj);
+ 			for (k = j; k < MAX_NR_FUNC; k++)
+ 			  if (func_table[k])
+ 			    func_table[k] = fnw + (func_table[k] - funcbufptr) + delta;
+ 		    }
+ 		    if (funcbufptr != func_buf)
+ 		      kfree(funcbufptr);
+ 		    funcbufptr = fnw;
+ 		    fnw = NULL;		/* now owned by funcbufptr, must not be freed */
+ 		    funcbufleft = funcbufleft - delta + sz - funcbufsize;
+ 		    funcbufsize = sz;
+ 		}
+ 		/* finally insert item itself */
+ 		strcpy(func_table[i], kbs->kb_string);
+ 		spin_unlock_irqrestore(&func_buf_lock, flags);
+ 		kfree(fnw);	/* spare buffer left unused if the string fit after a retry */
+ 		break;
+ 	}
+ 	ret = 0;
+ reterr:
+ 	kfree(kbs);
+ 	return ret;
+ }
+ 
+ int vt_do_kdskled(int console, int cmd, unsigned long arg, int perm)
+ {
+ 	struct kbd_struct *kb = kbd_table + console;
+         unsigned long flags;
+ 	unsigned char ucval;
+ 
+         switch(cmd) {
+ 	/* the ioctls below read/set the flags usually shown in the leds */
+ 	/* don't use them - they will go away without warning */
+ 	case KDGKBLED:
+                 spin_lock_irqsave(&kbd_event_lock, flags);
+ 		ucval = kb->ledflagstate | (kb->default_ledflagstate << 4);
+                 spin_unlock_irqrestore(&kbd_event_lock, flags);
+ 		return put_user(ucval, (char __user *)arg);
+ 	/* NOTE(review): read above uses kbd_event_lock, write below led_lock - confirm */
+ 	case KDSKBLED:
+ 		if (!perm)
+ 			return -EPERM;
+ 		if (arg & ~0x77)	/* only bits 0-2 and 4-6 may be set */
+ 			return -EINVAL;
+                 spin_lock_irqsave(&led_lock, flags);
+ 		kb->ledflagstate = (arg & 7);
+ 		kb->default_ledflagstate = ((arg >> 4) & 7);
+ 		set_leds();
+                 spin_unlock_irqrestore(&led_lock, flags);
+ 		return 0;
+ 
+ 	/* the ioctls below only set the lights, not the functions */
+ 	/* for those, see KDGKBLED and KDSKBLED above */
+ 	case KDGETLED:
+ 		ucval = getledstate();
+ 		return put_user(ucval, (char __user *)arg);
+ 
+ 	case KDSETLED:
+ 		if (!perm)
+ 			return -EPERM;
+ 		setledstate(kb, arg);
+ 		return 0;
+         }
+         return -ENOIOCTLCMD;	/* cmd is not one of the four LED ioctls */
+ }
+ 
+ int vt_do_kdgkbmode(int console)
+ {
+ 	/* Spot read of the mode, deliberately done without locking */
+ 	switch (kbd_table[console].kbdmode) {
+ 	case VC_OFF:
+ 		return K_OFF;
+ 	case VC_UNICODE:
+ 		return K_UNICODE;
+ 	case VC_MEDIUMRAW:
+ 		return K_MEDIUMRAW;
+ 	case VC_RAW:
+ 		return K_RAW;
+ 	default:
+ 		/* every remaining mode value is reported as K_XLATE */
+ 		return K_XLATE;
+ 	}
+ }
+ 
+ /**
+  *	vt_do_kdgkbmeta		-	report meta status
+  *	@console: console to report
+  *
+  *	Returns K_ESCPREFIX if VC_META is set for this console, else K_METABIT
+  */
+ int vt_do_kdgkbmeta(int console)
+ {
+ 	struct kbd_struct *kb = kbd_table + console;
+ 	/* Again a spot read so no locking */
+ 	return vc_kbd_mode(kb, VC_META) ? K_ESCPREFIX : K_METABIT;
+ }
+ 
+ /**
+  *	vt_reset_unicode	-	reset the unicode status
+  *	@console: console being reset
+  *
+  *	Set kbdmode back to VC_UNICODE or VC_XLATE, as default_utf8 dictates
+  */
+ void vt_reset_unicode(int console)
+ {
+ 	unsigned long flags;
+ 
+ 	spin_lock_irqsave(&kbd_event_lock, flags);
+ 	kbd_table[console].kbdmode = default_utf8 ? VC_UNICODE : VC_XLATE;
+ 	spin_unlock_irqrestore(&kbd_event_lock, flags);
+ }
+ 
+ /**
+  *	vt_get_shift_state	-	shift bit state
+  *
+  *	Report the shift bits from the keyboard state. We have to export
+  *	this to support some oddities in the vt layer.
+  */
+ int vt_get_shift_state(void)
+ {
+         /* Don't lock as this is a transient report */
+         return shift_state;
+ }
+ 
+ /**
+  *	vt_reset_keyboard	-	reset keyboard state
+  *	@console: console to reset
+  *
+  *	Reset the keyboard bits for a console as part of a general console
+  *	reset event. led_lock is taken nested inside kbd_event_lock here.
+  */
+ void vt_reset_keyboard(int console)
+ {
+ 	struct kbd_struct *kb = kbd_table + console;
+ 	unsigned long flags;
+ 
+ 	spin_lock_irqsave(&kbd_event_lock, flags);
+ 	set_vc_kbd_mode(kb, VC_REPEAT);
+ 	clr_vc_kbd_mode(kb, VC_CKMODE);
+ 	clr_vc_kbd_mode(kb, VC_APPLIC);
+ 	clr_vc_kbd_mode(kb, VC_CRLF);
+ 	kb->lockstate = 0;
+ 	kb->slockstate = 0;
+ 	spin_lock(&led_lock);	/* interrupts are already off via the irqsave above */
+ 	kb->ledmode = LED_SHOW_FLAGS;
+ 	kb->ledflagstate = kb->default_ledflagstate;
+ 	spin_unlock(&led_lock);
+ 	/* do not do set_leds here because this causes an endless tasklet loop
+ 	   when the keyboard hasn't been initialized yet */
+ 	spin_unlock_irqrestore(&kbd_event_lock, flags);
+ }
+ 
+ /**
+  *	vt_get_kbd_mode_bit	-	read keyboard status bits
+  *	@console: console to read from
+  *	@bit: mode bit to read
+  *
+  *	Return the state of a vt mode bit. We do this without locking so
+  *	the caller must be sure that there are no synchronization needs
+  */
+ 
+ int vt_get_kbd_mode_bit(int console, int bit)
+ {
+ 	struct kbd_struct *kb = kbd_table + console;
+ 	return vc_kbd_mode(kb, bit);
+ }
+ 
+ /**
+  *	vt_set_kbd_mode_bit	-	set a keyboard status bit
+  *	@console: console to update
+  *	@bit: mode bit to set
+  *
+  *	Set a vt mode bit. The update is made with kbd_event_lock held
+  *	and interrupts disabled.
+  */
+ 
+ void vt_set_kbd_mode_bit(int console, int bit)
+ {
+ 	struct kbd_struct *kb = kbd_table + console;
+ 	unsigned long flags;
+ 
+ 	spin_lock_irqsave(&kbd_event_lock, flags);
+ 	set_vc_kbd_mode(kb, bit);
+ 	spin_unlock_irqrestore(&kbd_event_lock, flags);
+ }
+ 
+ /**
+  *	vt_clr_kbd_mode_bit	-	clear a keyboard status bit
+  *	@console: console to update
+  *	@bit: mode bit to clear
+  *
+  *	Clear a vt mode bit. The update is made with kbd_event_lock held
+  *	and interrupts disabled.
+  */
+ 
+ void vt_clr_kbd_mode_bit(int console, int bit)
+ {
+ 	struct kbd_struct *kb = kbd_table + console;
+ 	unsigned long flags;
+ 
+ 	spin_lock_irqsave(&kbd_event_lock, flags);
+ 	clr_vc_kbd_mode(kb, bit);
+ 	spin_unlock_irqrestore(&kbd_event_lock, flags);
+ }
diff --color -rcNP Master/drivers/tty/vt/keyboard.c.rej OG/drivers/tty/vt/keyboard.c.rej
*** Master/drivers/tty/vt/keyboard.c.rej	1969-12-31 19:00:00.000000000 -0500
--- OG/drivers/tty/vt/keyboard.c.rej	2021-04-20 15:11:27.311000000 -0400
***************
*** 0 ****
--- 1,19 ----
+ *** drivers/tty/vt/keyboard.c	2021-03-13 13:52:21.000000000 +0200
+ --- drivers/tty/vt/keyboard.c	2021-03-11 15:06:51.000000000 +0200
+ ***************
+ *** 1780,1786 ****
+   					  ct * sizeof(struct kbdiacruc));
+   			if (IS_ERR(buf))
+   				return PTR_ERR(buf);
+ ! 		}
+   		spin_lock_irqsave(&kbd_event_lock, flags);
+   		if (ct)
+   			memcpy(accent_table, buf,
+ --- 1772,1778 ----
+   					  ct * sizeof(struct kbdiacruc));
+   			if (IS_ERR(buf))
+   				return PTR_ERR(buf);
+ ! 		}
+   		spin_lock_irqsave(&kbd_event_lock, flags);
+   		if (ct)
+   			memcpy(accent_table, buf,
diff --color -rcNP Master/fs/binfmt_elf.c OG/fs/binfmt_elf.c
*** Master/fs/binfmt_elf.c	2021-04-20 14:17:31.000000000 -0400
--- OG/fs/binfmt_elf.c	2021-04-20 15:11:34.506000000 -0400
***************
*** 42,47 ****
--- 42,48 ----
  #include <linux/cred.h>
  #include <linux/dax.h>
  #include <linux/uaccess.h>
+ #include <linux/xattr.h>
  #include <asm/param.h>
  #include <asm/page.h>
  
***************
*** 75,80 ****
--- 76,85 ----
  #define elf_core_dump	NULL
  #endif
  
+ #ifdef CONFIG_MINISEC_MPROTECT
+ static void elf_handle_mprotect(struct vm_area_struct *vma, unsigned long newflags);
+ #endif
+ 
  #if ELF_EXEC_PAGESIZE > PAGE_SIZE
  #define ELF_MIN_ALIGN	ELF_EXEC_PAGESIZE
  #else
***************
*** 94,99 ****
--- 99,107 ----
  	.load_binary	= load_elf_binary,
  	.load_shlib	= load_elf_library,
  	.core_dump	= elf_core_dump,
+ #ifdef CONFIG_MINISEC_MPROTECT
+ 	.handle_mprotect= elf_handle_mprotect,
+ #endif
  	.min_coredump	= ELF_EXEC_PAGESIZE,
  };
  
***************
*** 235,241 ****
  	} while (0)
  
  #ifdef ARCH_DLINFO
! 	/* 
  	 * ARCH_DLINFO must come first so PPC can do its special alignment of
  	 * AUXV.
  	 * update AT_VECTOR_SIZE_ARCH if the number of NEW_AUX_ENT() in
--- 243,249 ----
  	} while (0)
  
  #ifdef ARCH_DLINFO
! 	/*
  	 * ARCH_DLINFO must come first so PPC can do its special alignment of
  	 * AUXV.
  	 * update AT_VECTOR_SIZE_ARCH if the number of NEW_AUX_ENT() in
***************
*** 665,670 ****
--- 673,1008 ----
  	return error;
  }
  
+ #ifdef CONFIG_MINISEC_PT_PAX_FLAGS
+ #ifdef CONFIG_MINISEC_SOFTMODE
+ static unsigned long pax_parse_pt_pax_softmode(const struct elf_phdr * const elf_phdata)
+ {
+ 	unsigned long mm_flags = 0UL;
+ 	/* Opt-in: a feature is enabled only when its PF_* bit is set. */
+ #ifdef CONFIG_MINISEC_PAGEEXEC
+ 	if (elf_phdata->p_flags & PF_PAGEEXEC)
+ 		mm_flags |= MF_PAX_PAGEEXEC;
+ #endif
+ 
+ #ifdef CONFIG_MINISEC_SEGMEXEC
+ 	if (elf_phdata->p_flags & PF_SEGMEXEC)
+ 		mm_flags |= MF_PAX_SEGMEXEC;
+ #endif
+ 
+ #ifdef CONFIG_MINISEC_EMUTRAMP
+ 	if ((mm_flags & (MF_PAX_PAGEEXEC | MF_PAX_SEGMEXEC)) && (elf_phdata->p_flags & PF_EMUTRAMP))
+ 		mm_flags |= MF_PAX_EMUTRAMP;
+ #endif
+ 
+ #ifdef CONFIG_MINISEC_MPROTECT
+ 	if (elf_phdata->p_flags & PF_MPROTECT)
+ 		mm_flags |= MF_PAX_MPROTECT;
+ #endif
+ 
+ #if defined(CONFIG_MINISEC_RANDMMAP) || defined(CONFIG_MINISEC_RANDUSTACK)
+ 	if (randomize_va_space && (elf_phdata->p_flags & PF_RANDMMAP))
+ 		mm_flags |= MF_PAX_RANDMMAP;
+ #endif
+ 
+ 	return mm_flags;
+ }
+ #endif
+ 
+ static unsigned long pax_parse_pt_pax_hardmode(const struct elf_phdr * const elf_phdata)
+ {
+ 	unsigned long mm_flags = 0UL;
+ 	/* Opt-out: a feature is enabled unless its PF_NO* bit is set. */
+ #ifdef CONFIG_MINISEC_PAGEEXEC
+ 	if (!(elf_phdata->p_flags & PF_NOPAGEEXEC))
+ 		mm_flags |= MF_PAX_PAGEEXEC;
+ #endif
+ 
+ #ifdef CONFIG_MINISEC_SEGMEXEC
+ 	if (!(elf_phdata->p_flags & PF_NOSEGMEXEC))
+ 		mm_flags |= MF_PAX_SEGMEXEC;
+ #endif
+ 
+ #ifdef CONFIG_MINISEC_EMUTRAMP
+ 	if (!(elf_phdata->p_flags & PF_NOEMUTRAMP))
+ 		mm_flags |= MF_PAX_EMUTRAMP;
+ #endif
+ 
+ #ifdef CONFIG_MINISEC_MPROTECT
+ 	if (!(elf_phdata->p_flags & PF_NOMPROTECT))
+ 		mm_flags |= MF_PAX_MPROTECT;
+ #endif
+ 
+ #if defined(CONFIG_MINISEC_RANDMMAP) || defined(CONFIG_MINISEC_RANDUSTACK)
+ 	if (randomize_va_space && !(elf_phdata->p_flags & PF_NORANDMMAP))
+ 		mm_flags |= MF_PAX_RANDMMAP;
+ #endif
+ 
+ 	return mm_flags;
+ }
+ #endif
+ 
+ #ifdef CONFIG_MINISEC_XATTR_PAX_FLAGS
+ #ifdef CONFIG_MINISEC_SOFTMODE
+ static unsigned long pax_parse_xattr_pax_softmode(unsigned long pax_flags_softmode)
+ {
+ 	unsigned long mm_flags = 0UL;
+ 	/* Opt-in: only features present in pax_flags_softmode are enabled. */
+ #ifdef CONFIG_MINISEC_PAGEEXEC
+ 	if (pax_flags_softmode & MF_PAX_PAGEEXEC)
+ 		mm_flags |= MF_PAX_PAGEEXEC;
+ #endif
+ 
+ #ifdef CONFIG_MINISEC_SEGMEXEC
+ 	if (pax_flags_softmode & MF_PAX_SEGMEXEC)
+ 		mm_flags |= MF_PAX_SEGMEXEC;
+ #endif
+ 
+ #ifdef CONFIG_MINISEC_EMUTRAMP
+ 	if ((mm_flags & (MF_PAX_PAGEEXEC | MF_PAX_SEGMEXEC)) && (pax_flags_softmode & MF_PAX_EMUTRAMP))
+ 		mm_flags |= MF_PAX_EMUTRAMP;
+ #endif
+ 
+ #ifdef CONFIG_MINISEC_MPROTECT
+ 	if (pax_flags_softmode & MF_PAX_MPROTECT)
+ 		mm_flags |= MF_PAX_MPROTECT;
+ #endif
+ 
+ #if defined(CONFIG_MINISEC_RANDMMAP) || defined(CONFIG_MINISEC_RANDUSTACK)
+ 	if (randomize_va_space && (pax_flags_softmode & MF_PAX_RANDMMAP))
+ 		mm_flags |= MF_PAX_RANDMMAP;
+ #endif
+ 
+ 	return mm_flags;
+ }
+ #endif
+ 
+ static unsigned long pax_parse_xattr_pax_hardmode(unsigned long pax_flags_hardmode)
+ {
+ 	unsigned long mm_flags = 0UL;
+ 	/* Opt-out: a bit set in pax_flags_hardmode turns that feature off. */
+ #ifdef CONFIG_MINISEC_PAGEEXEC
+ 	if (!(pax_flags_hardmode & MF_PAX_PAGEEXEC))
+ 		mm_flags |= MF_PAX_PAGEEXEC;
+ #endif
+ 
+ #ifdef CONFIG_MINISEC_SEGMEXEC
+ 	if (!(pax_flags_hardmode & MF_PAX_SEGMEXEC))
+ 		mm_flags |= MF_PAX_SEGMEXEC;
+ #endif
+ 
+ #ifdef CONFIG_MINISEC_EMUTRAMP
+ 	if (!(pax_flags_hardmode & MF_PAX_EMUTRAMP))
+ 		mm_flags |= MF_PAX_EMUTRAMP;
+ #endif
+ 
+ #ifdef CONFIG_MINISEC_MPROTECT
+ 	if (!(pax_flags_hardmode & MF_PAX_MPROTECT))
+ 		mm_flags |= MF_PAX_MPROTECT;
+ #endif
+ 
+ #if defined(CONFIG_MINISEC_RANDMMAP) || defined(CONFIG_MINISEC_RANDUSTACK)
+ 	if (randomize_va_space && !(pax_flags_hardmode & MF_PAX_RANDMMAP))
+ 		mm_flags |= MF_PAX_RANDMMAP;
+ #endif
+ 
+ 	return mm_flags;
+ }
+ #endif
+ 
+ #if defined(CONFIG_MINISEC_NOEXEC) || defined(CONFIG_MINISEC_ASLR)
+ static unsigned long pax_parse_defaults(void)
+ {
+ 	unsigned long mm_flags = 0UL;
+ 	/* With pax_softmode set nothing is on by default; otherwise all built-in features are. */
+ #ifdef CONFIG_MINISEC_SOFTMODE
+ 	if (pax_softmode)
+ 		return mm_flags;
+ #endif
+ 
+ #ifdef CONFIG_MINISEC_PAGEEXEC
+ 	mm_flags |= MF_PAX_PAGEEXEC;
+ #endif
+ 
+ #ifdef CONFIG_MINISEC_SEGMEXEC
+ 	mm_flags |= MF_PAX_SEGMEXEC;
+ #endif
+ 
+ #ifdef CONFIG_MINISEC_MPROTECT
+ 	mm_flags |= MF_PAX_MPROTECT;
+ #endif
+ 
+ #ifdef CONFIG_MINISEC_RANDMMAP
+ 	if (randomize_va_space)
+ 		mm_flags |= MF_PAX_RANDMMAP;
+ #endif
+ 
+ 	return mm_flags;
+ }
+ 
+ static unsigned long pax_parse_ei_pax(const struct elfhdr * const elf_ex)
+ {
+ 	unsigned long pax_flags = PAX_PARSE_FLAGS_FALLBACK;
+ 	/* Without CONFIG_MINISEC_EI_PAX this always returns the FALLBACK marker. */
+ #ifdef CONFIG_MINISEC_EI_PAX
+ 
+ #ifdef CONFIG_MINISEC_SOFTMODE
+ 	if (pax_softmode)	/* the EI_PAX byte is not consulted in soft mode */
+ 		return pax_flags;
+ #endif
+ 
+ 	pax_flags = 0UL;
+ 	/* From here on a set EF_PAX_* bit in e_ident[EI_PAX] disables a feature... */
+ #ifdef CONFIG_MINISEC_PAGEEXEC
+ 	if (!(elf_ex->e_ident[EI_PAX] & EF_PAX_PAGEEXEC))
+ 		pax_flags |= MF_PAX_PAGEEXEC;
+ #endif
+ 
+ #ifdef CONFIG_MINISEC_SEGMEXEC
+ 	if (!(elf_ex->e_ident[EI_PAX] & EF_PAX_SEGMEXEC))
+ 		pax_flags |= MF_PAX_SEGMEXEC;
+ #endif
+ 	/* ...except EF_PAX_EMUTRAMP, which enables it (given PAGEEXEC or SEGMEXEC). */
+ #ifdef CONFIG_MINISEC_EMUTRAMP
+ 	if ((pax_flags & (MF_PAX_PAGEEXEC | MF_PAX_SEGMEXEC)) && (elf_ex->e_ident[EI_PAX] & EF_PAX_EMUTRAMP))
+ 		pax_flags |= MF_PAX_EMUTRAMP;
+ #endif
+ 	/* MPROTECT is only turned on together with PAGEEXEC or SEGMEXEC. */
+ #ifdef CONFIG_MINISEC_MPROTECT
+ 	if ((pax_flags & (MF_PAX_PAGEEXEC | MF_PAX_SEGMEXEC)) && !(elf_ex->e_ident[EI_PAX] & EF_PAX_MPROTECT))
+ 		pax_flags |= MF_PAX_MPROTECT;
+ #endif
+ 
+ #ifdef CONFIG_MINISEC_ASLR
+ 	if (randomize_va_space && !(elf_ex->e_ident[EI_PAX] & EF_PAX_RANDMMAP))
+ 		pax_flags |= MF_PAX_RANDMMAP;
+ #endif
+ 
+ #endif
+ 
+ 	return pax_flags;
+ 
+ }
+ 
+ static unsigned long pax_parse_pt_pax(const struct elfhdr * const elf_ex, const struct elf_phdr * const elf_phdata)
+ {
+ 	/* Flags of the first PT_PAX_FLAGS program header; FALLBACK if absent or contradictory. */
+ #ifdef CONFIG_MINISEC_PT_PAX_FLAGS
+ 	unsigned long i;
+ 
+ 	for (i = 0UL; i < elf_ex->e_phnum; i++) {
+ 		const struct elf_phdr *ph = &elf_phdata[i];
+ 
+ 		if (ph->p_type != PT_PAX_FLAGS)
+ 			continue;
+ 		/* A header that both requests and refuses one feature is rejected. */
+ 		if (((ph->p_flags & PF_PAGEEXEC) && (ph->p_flags & PF_NOPAGEEXEC)) ||
+ 		    ((ph->p_flags & PF_SEGMEXEC) && (ph->p_flags & PF_NOSEGMEXEC)) ||
+ 		    ((ph->p_flags & PF_EMUTRAMP) && (ph->p_flags & PF_NOEMUTRAMP)) ||
+ 		    ((ph->p_flags & PF_MPROTECT) && (ph->p_flags & PF_NOMPROTECT)) ||
+ 		    ((ph->p_flags & PF_RANDMMAP) && (ph->p_flags & PF_NORANDMMAP)))
+ 			return PAX_PARSE_FLAGS_FALLBACK;
+ #ifdef CONFIG_MINISEC_SOFTMODE
+ 		if (pax_softmode)
+ 			return pax_parse_pt_pax_softmode(ph);
+ #endif
+ 		return pax_parse_pt_pax_hardmode(ph);
+ 	}
+ #endif
+ 
+ 	return PAX_PARSE_FLAGS_FALLBACK;
+ }
+ 
+ static unsigned long pax_parse_xattr_pax(struct file * const file)
+ {
+ 	/* Parse the file's PaX xattr (letters from "pemrs"/"PEMRS"); FALLBACK if unusable. */
+ #ifdef CONFIG_MINISEC_XATTR_PAX_FLAGS
+ 	ssize_t xattr_size, i;
+ 	unsigned char xattr_value[sizeof("pemrs") - 1];
+ 	unsigned long pax_flags_hardmode = 0UL, pax_flags_softmode = 0UL;
+ 
+ 	xattr_size = pax_getxattr(file->f_path.dentry, xattr_value, sizeof xattr_value);
+ 	if (xattr_size < 0 || xattr_size > sizeof xattr_value)
+ 		return PAX_PARSE_FLAGS_FALLBACK;
+ 
+ 	for (i = 0; i < xattr_size; i++)
+ 		switch (xattr_value[i]) {
+ 		default:	/* any character outside the ten known letters */
+ 			return PAX_PARSE_FLAGS_FALLBACK;
+ 		/* option1 goes into the hardmode set, option2 into the softmode set; repeats are rejected */
+ #define parse_flag(option1, option2, flag)			\
+ 		case option1:					\
+ 			if (pax_flags_hardmode & MF_PAX_##flag)	\
+ 				return PAX_PARSE_FLAGS_FALLBACK;\
+ 			pax_flags_hardmode |= MF_PAX_##flag;	\
+ 			break;					\
+ 		case option2:					\
+ 			if (pax_flags_softmode & MF_PAX_##flag)	\
+ 				return PAX_PARSE_FLAGS_FALLBACK;\
+ 			pax_flags_softmode |= MF_PAX_##flag;	\
+ 			break;
+ 
+ 		parse_flag('p', 'P', PAGEEXEC);
+ 		parse_flag('e', 'E', EMUTRAMP);
+ 		parse_flag('m', 'M', MPROTECT);
+ 		parse_flag('r', 'R', RANDMMAP);
+ 		parse_flag('s', 'S', SEGMEXEC);
+ 
+ #undef parse_flag
+ 		}
+ 	/* The same feature named in both cases (e.g. "pP") is contradictory. */
+ 	if (pax_flags_hardmode & pax_flags_softmode)
+ 		return PAX_PARSE_FLAGS_FALLBACK;
+ 
+ #ifdef CONFIG_MINISEC_SOFTMODE
+ 	if (pax_softmode)
+ 		return pax_parse_xattr_pax_softmode(pax_flags_softmode);
+ 	else
+ #endif
+ 
+ 		return pax_parse_xattr_pax_hardmode(pax_flags_hardmode);
+ #else
+ 	return PAX_PARSE_FLAGS_FALLBACK;
+ #endif
+ 
+ }
+ 
+ static long pax_parse_pax_flags(const struct elfhdr * const elf_ex, const struct elf_phdr * const elf_phdata, struct file * const file)
+ {
+ 	unsigned long pax_flags, ei_pax_flags,  pt_pax_flags, xattr_pax_flags;
+ 	/* Query every source; PAX_PARSE_FLAGS_FALLBACK marks one that gave no usable flags. */
+ 	pax_flags = pax_parse_defaults();
+ 	ei_pax_flags = pax_parse_ei_pax(elf_ex);
+ 	pt_pax_flags = pax_parse_pt_pax(elf_ex, elf_phdata);
+ 	xattr_pax_flags = pax_parse_xattr_pax(file);
+ 	/* PT_PAX_FLAGS and the xattr must agree when both are usable. */
+ 	if (pt_pax_flags != PAX_PARSE_FLAGS_FALLBACK &&
+ 	    xattr_pax_flags != PAX_PARSE_FLAGS_FALLBACK &&
+ 	    pt_pax_flags != xattr_pax_flags)
+ 		return -EINVAL;
+ 	if (xattr_pax_flags != PAX_PARSE_FLAGS_FALLBACK)	/* precedence: xattr, */
+ 		pax_flags = xattr_pax_flags;
+ 	else if (pt_pax_flags != PAX_PARSE_FLAGS_FALLBACK)	/* then PT_PAX_FLAGS, */
+ 		pax_flags = pt_pax_flags;
+ 	else if (ei_pax_flags != PAX_PARSE_FLAGS_FALLBACK)	/* then EI_PAX, else defaults */
+ 		pax_flags = ei_pax_flags;
+ 	/* If both exec schemes are requested keep only one: PAGEEXEC if _PAGE_NX is supported. */
+ #if defined(CONFIG_MINISEC_PAGEEXEC) && defined(CONFIG_MINISEC_SEGMEXEC)
+ 	if ((pax_flags & (MF_PAX_PAGEEXEC | MF_PAX_SEGMEXEC)) == (MF_PAX_PAGEEXEC | MF_PAX_SEGMEXEC)) {
+ 		if ((__supported_pte_mask & _PAGE_NX))
+ 			pax_flags &= ~MF_PAX_SEGMEXEC;
+ 		else
+ 			pax_flags &= ~MF_PAX_PAGEEXEC;
+ 	}
+ #endif
+ 
+ 	if (0 > pax_check_flags(&pax_flags))
+ 		return -EINVAL;
+ 
+ 	current->mm->pax_flags = pax_flags;	/* published only once validated */
+ 	return 0;
+ }
+ #endif
+ 
  /*
   * These are the functions used to load ELF style executables and shared
   * libraries.  There is no binary dependent code anywhere else.
***************
*** 697,703 ****
  		retval = -ENOMEM;
  		goto out_ret;
  	}
! 	
  	/* Get the exec-header */
  	loc->elf_ex = *((struct elfhdr *)bprm->buf);
  
--- 1035,1041 ----
  		retval = -ENOMEM;
  		goto out_ret;
  	}
! 
  	/* Get the exec-header */
  	loc->elf_ex = *((struct elfhdr *)bprm->buf);
  
***************
*** 851,856 ****
--- 1189,1232 ----
  	/* Do this immediately, since STACK_TOP as used in setup_arg_pages
  	   may depend on the personality.  */
  	SET_PERSONALITY2(loc->elf_ex, &arch_state);
+ 
+ #if defined(CONFIG_MINISEC_NOEXEC) || defined(CONFIG_MINISEC_ASLR)
+ 	current->mm->pax_flags = 0UL;
+ #endif
+ 
+ #ifdef CONFIG_MINISEC_ASLR
+ 	current->mm->delta_mmap = 0UL;
+ 	current->mm->delta_stack = 0UL;
+ #endif
+ 
+ #if defined(CONFIG_MINISEC_NOEXEC) || defined(CONFIG_MINISEC_ASLR)
+ 	if (0 > pax_parse_pax_flags(&loc->elf_ex, elf_phdata, bprm->file)) {
+ 		retval = -EINVAL;
+ 		goto out_free_dentry;
+ 	}
+ #endif
+ 	/* Needs the parsed flags: before pax_parse_pax_flags() pax_flags is 0. */
+ #if defined(CONFIG_MINISEC_PAGEEXEC) && defined(CONFIG_X86_32)
+ 	if ((current->mm->pax_flags & MF_PAX_PAGEEXEC) && !(__supported_pte_mask & _PAGE_NX)) {
+ 		current->mm->context.user_cs_limit = PAGE_SIZE;
+ 		current->mm->def_flags |= VM_PAGEEXEC | VM_NOHUGEPAGE;
+ 	}
+ #endif
+ 
+ #ifdef CONFIG_MINISEC_ASLR
+ 	if (current->mm->pax_flags & MF_PAX_RANDMMAP) {
+ 		current->mm->delta_mmap = (pax_get_random_long() & ((1UL << PAX_DELTA_MMAP_LEN)-1)) << PAGE_SHIFT;
+ 		current->mm->delta_stack = (pax_get_random_long() & ((1UL << PAX_DELTA_STACK_LEN)-1)) << PAGE_SHIFT;
+ 	}
+ #endif
+ 
+ #ifdef CONFIG_MINISEC_PAGEEXEC
+ 	if (current->mm->pax_flags & MF_PAX_PAGEEXEC) {
+ 		executable_stack = EXSTACK_DISABLE_X;
+ 		current->personality &= ~READ_IMPLIES_EXEC;
+ 	} else
+ #endif
+ 
  	if (elf_read_implies_exec(loc->elf_ex, executable_stack))
  		current->personality |= READ_IMPLIES_EXEC;
  
***************
*** 866,872 ****
  				 executable_stack);
  	if (retval < 0)
  		goto out_free_dentry;
! 	
  	elf_bss = 0;
  	elf_brk = 0;
  
--- 1242,1248 ----
  				 executable_stack);
  	if (retval < 0)
  		goto out_free_dentry;
! 
  	elf_bss = 0;
  	elf_brk = 0;
  
***************
*** 888,894 ****
  
  		if (unlikely (elf_brk > elf_bss)) {
  			unsigned long nbyte;
! 	            
  			/* There was a PT_LOAD segment with p_memsz > p_filesz
  			   before this one. Map anonymous pages, if needed,
  			   and clear the area.  */
--- 1264,1270 ----
  
  		if (unlikely (elf_brk > elf_bss)) {
  			unsigned long nbyte;
! 
  			/* There was a PT_LOAD segment with p_memsz > p_filesz
  			   before this one. Map anonymous pages, if needed,
  			   and clear the area.  */
***************
*** 1456,1462 ****
  	phdr->p_align = 0;
  }
  
! static void fill_note(struct memelfnote *note, const char *name, int type, 
  		unsigned int sz, void *data)
  {
  	note->name = name;
--- 1832,1838 ----
  	phdr->p_align = 0;
  }
  
! static void fill_note(struct memelfnote *note, const char *name, int type,
  		unsigned int sz, void *data)
  {
  	note->name = name;
***************
*** 1508,1514 ****
  {
  	const struct cred *cred;
  	unsigned int i, len;
! 	
  	/* first copy the parameters from user space */
  	memset(psinfo, 0, sizeof(struct elf_prpsinfo));
  
--- 1884,1890 ----
  {
  	const struct cred *cred;
  	unsigned int i, len;
! 
  	/* first copy the parameters from user space */
  	memset(psinfo, 0, sizeof(struct elf_prpsinfo));
  
***************
*** 1542,1548 ****
  	SET_GID(psinfo->pr_gid, from_kgid_munged(cred->user_ns, cred->gid));
  	rcu_read_unlock();
  	strncpy(psinfo->pr_fname, p->comm, sizeof(psinfo->pr_fname));
! 	
  	return 0;
  }
  
--- 1918,1924 ----
  	SET_GID(psinfo->pr_gid, from_kgid_munged(cred->user_ns, cred->gid));
  	rcu_read_unlock();
  	strncpy(psinfo->pr_fname, p->comm, sizeof(psinfo->pr_fname));
! 
  	return 0;
  }
  
***************
*** 1939,1946 ****
  	t->num_notes = 0;
  
  	fill_prstatus(&t->prstatus, p, signr);
! 	elf_core_copy_task_regs(p, &t->prstatus.pr_reg);	
! 	
  	fill_note(&t->notes[0], "CORE", NT_PRSTATUS, sizeof(t->prstatus),
  		  &(t->prstatus));
  	t->num_notes++;
--- 2315,2322 ----
  	t->num_notes = 0;
  
  	fill_prstatus(&t->prstatus, p, signr);
! 	elf_core_copy_task_regs(p, &t->prstatus.pr_reg);
! 
  	fill_note(&t->notes[0], "CORE", NT_PRSTATUS, sizeof(t->prstatus),
  		  &(t->prstatus));
  	t->num_notes++;
***************
*** 1961,1967 ****
  		t->num_notes++;
  		sz += notesize(&t->notes[2]);
  	}
! #endif	
  	return sz;
  }
  
--- 2337,2343 ----
  		t->num_notes++;
  		sz += notesize(&t->notes[2]);
  	}
! #endif
  	return sz;
  }
  
***************
*** 2199,2205 ****
  
  	/*
  	 * We no longer stop all VM operations.
! 	 * 
  	 * This is because those proceses that could possibly change map_count
  	 * or the mmap / vma pages are now blocked in do_exit on current
  	 * finishing this core dump.
--- 2575,2581 ----
  
  	/*
  	 * We no longer stop all VM operations.
! 	 *
  	 * This is because those proceses that could possibly change map_count
  	 * or the mmap / vma pages are now blocked in do_exit on current
  	 * finishing this core dump.
***************
*** 2208,2214 ****
  	 * the map_count or the pages allocated. So no possibility of crashing
  	 * exists while dumping the mm->vm_next areas to the core file.
  	 */
!   
  	/* alloc memory for large data structures: too large to be on stack */
  	elf = kmalloc(sizeof(*elf), GFP_KERNEL);
  	if (!elf)
--- 2584,2590 ----
  	 * the map_count or the pages allocated. So no possibility of crashing
  	 * exists while dumping the mm->vm_next areas to the core file.
  	 */
! 
  	/* alloc memory for large data structures: too large to be on stack */
  	elf = kmalloc(sizeof(*elf), GFP_KERNEL);
  	if (!elf)
***************
*** 2383,2388 ****
--- 2759,2880 ----
  
  #endif		/* CONFIG_ELF_CORE */
  
+ #ifdef CONFIG_MINISEC_MPROTECT
+ /* PaX: non-PIC ELF libraries need relocations on their executable segments
+  * therefore we'll grant them VM_MAYWRITE once during their life. Similarly
+  * we'll remove VM_MAYWRITE for good on RELRO segments. The checks favour
+  * ld-linux.so behaviour which operates on a per ELF segment basis because we
+  * want to allow the common case and not the special ones.
+  * @vma: mapping whose protection is being changed; @newflags: requested VM_* flags.
+  */
+ static void elf_handle_mprotect(struct vm_area_struct *vma, unsigned long newflags)
+ {
+ 	struct elfhdr elf_h;
+ 	struct elf_phdr elf_p;
+ 	unsigned long i, j, oldflags;	/* i: program header index, j: dynamic entry index */
+ 	loff_t pos = 0;	/* kernel_read() takes (and advances) a file position */
+ 	/* No TEXTREL detection is performed here, so both flags start out false. */
+ 	bool is_textrel_rw = false, is_textrel_rx = false, is_relro;
+ 
+ 	if (!(vma->vm_mm->pax_flags & MF_PAX_MPROTECT) || !vma->vm_file)
+ 		return;
+ 
+ 	oldflags = vma->vm_flags & (VM_MAYEXEC | VM_MAYWRITE | VM_MAYREAD | VM_EXEC | VM_WRITE | VM_READ);
+ 	newflags &= VM_MAYEXEC | VM_MAYWRITE | VM_MAYREAD | VM_EXEC | VM_WRITE | VM_READ;
+ 
+ 	/* possible RELRO */
+ 	is_relro = vma->anon_vma && oldflags == (VM_MAYWRITE | VM_MAYREAD | VM_READ) && newflags == (VM_MAYWRITE | VM_MAYREAD | VM_READ);
+ 
+ 	if (!is_textrel_rw && !is_textrel_rx && !is_relro)
+ 		return;
+ 
+ 	if (kernel_read(vma->vm_file, &elf_h, sizeof(elf_h), &pos) != sizeof(elf_h) ||
+ 	    memcmp(elf_h.e_ident, ELFMAG, SELFMAG) ||
+ 	    (is_relro && (elf_h.e_type != ET_DYN && elf_h.e_type != ET_EXEC)) ||
+ 	    !elf_check_arch(&elf_h) ||
+ 	    elf_h.e_phentsize != sizeof(struct elf_phdr) ||
+ 	    elf_h.e_phnum > 65536UL / sizeof(struct elf_phdr))
+ 		return;
+ 
+ 	for (i = 0UL; i < elf_h.e_phnum; i++) {
+ 		pos = elf_h.e_phoff + i * sizeof(elf_p);
+ 		if (kernel_read(vma->vm_file, &elf_p, sizeof(elf_p), &pos) != sizeof(elf_p))
+ 			return;
+ 		switch (elf_p.p_type) {
+ 		case PT_DYNAMIC:
+ 			if (!is_textrel_rw && !is_textrel_rx)
+ 				continue;
+ 			for (j = 0UL; (j + 1) * sizeof(elf_dyn) <= elf_p.p_filesz; j++) {
+ 				elf_dyn dyn;
+ 
+ 				pos = elf_p.p_offset + j * sizeof(dyn);
+ 				if (kernel_read(vma->vm_file, &dyn, sizeof(dyn), &pos) != sizeof(dyn))
+ 					break;
+ 				if (dyn.d_tag == DT_NULL)
+ 					break;
+ 				if (dyn.d_tag == DT_TEXTREL || (dyn.d_tag == DT_FLAGS && (dyn.d_un.d_val & DF_TEXTREL))) {
+ 					if (is_textrel_rw)
+ 						vma->vm_flags |= VM_MAYWRITE;
+ 					else
+ 						/* PaX: disallow write access after relocs are done, hopefully no one else needs it... */
+ 						vma->vm_flags &= ~VM_MAYWRITE;
+ 					break;
+ 				}
+ 			}
+ 			is_textrel_rw = false;
+ 			is_textrel_rx = false;
+ 			continue;
+ 
+ 		case PT_GNU_RELRO:
+ 			if (!is_relro)
+ 				continue;
+ 			if ((elf_p.p_offset >> PAGE_SHIFT) == vma->vm_pgoff && ELF_PAGEALIGN(elf_p.p_memsz) == vma->vm_end - vma->vm_start)
+ 				vma->vm_flags &= ~VM_MAYWRITE;
+ 			is_relro = false;
+ 			continue;
+ 
+ #ifdef CONFIG_MINISEC_PT_PAX_FLAGS
+ 		case PT_PAX_FLAGS: {
+ 			const char *msg_mprotect = "", *msg_emutramp = "";
+ 			char *buffer_lib, *buffer_exe;
+ 
+ 			if (elf_p.p_flags & PF_NOMPROTECT)
+ 				msg_mprotect = "MPROTECT disabled";
+ 
+ #ifdef CONFIG_MINISEC_EMUTRAMP
+ 			if (!(vma->vm_mm->pax_flags & MF_PAX_EMUTRAMP) && !(elf_p.p_flags & PF_NOEMUTRAMP))
+ 				msg_emutramp = "EMUTRAMP enabled";
+ #endif
+ 
+ 			if (!msg_mprotect[0] && !msg_emutramp[0])
+ 				continue;
+ 
+ 			if (!printk_ratelimit())
+ 				continue;
+ 
+ 			buffer_lib = (char *)__get_free_page(GFP_KERNEL);
+ 			buffer_exe = (char *)__get_free_page(GFP_KERNEL);
+ 			if (buffer_lib && buffer_exe) {
+ 				char *path_lib, *path_exe;
+ 
+ 				path_lib = pax_get_path(&vma->vm_file->f_path, buffer_lib, PAGE_SIZE);
+ 				path_exe = pax_get_path(&vma->vm_mm->exe_file->f_path, buffer_exe, PAGE_SIZE);
+ 
+ 				pr_info("PAX: %s wants %s%s%s on %s\n", path_lib, msg_mprotect,
+ 					(msg_mprotect[0] && msg_emutramp[0] ? " and " : ""), msg_emutramp, path_exe);
+ 
+ 			}
+ 			free_page((unsigned long)buffer_exe);
+ 			free_page((unsigned long)buffer_lib);
+ 			continue;
+ 		}
+ #endif
+ 
+ 		}
+ 	}
+ }
+ #endif
+ 
  static int __init init_elf_binfmt(void)
  {
  	register_binfmt(&elf_format);
diff --color -rcNP Master/fs/binfmt_elf.c.orig OG/fs/binfmt_elf.c.orig
*** Master/fs/binfmt_elf.c.orig	1969-12-31 19:00:00.000000000 -0500
--- OG/fs/binfmt_elf.c.orig	2021-04-20 15:10:45.378000000 -0400
***************
*** 0 ****
--- 1,2892 ----
+ // SPDX-License-Identifier: GPL-2.0-only
+ /*
+  * linux/fs/binfmt_elf.c
+  *
+  * These are the functions used to load ELF format executables as used
+  * on SVr4 machines.  Information on the format may be found in the book
+  * "UNIX SYSTEM V RELEASE 4 Programmers Guide: Ansi C and Programming Support
+  * Tools".
+  *
+  * Copyright 1993, 1994: Eric Youngdale (ericy@cais.com).
+  */
+ 
+ #include <linux/module.h>
+ #include <linux/kernel.h>
+ #include <linux/fs.h>
+ #include <linux/mm.h>
+ #include <linux/mman.h>
+ #include <linux/errno.h>
+ #include <linux/signal.h>
+ #include <linux/binfmts.h>
+ #include <linux/string.h>
+ #include <linux/file.h>
+ #include <linux/slab.h>
+ #include <linux/personality.h>
+ #include <linux/elfcore.h>
+ #include <linux/init.h>
+ #include <linux/highuid.h>
+ #include <linux/compiler.h>
+ #include <linux/highmem.h>
+ #include <linux/pagemap.h>
+ #include <linux/vmalloc.h>
+ #include <linux/security.h>
+ #include <linux/random.h>
+ #include <linux/elf.h>
+ #include <linux/elf-randomize.h>
+ #include <linux/utsname.h>
+ #include <linux/coredump.h>
+ #include <linux/sched.h>
+ #include <linux/sched/coredump.h>
+ #include <linux/sched/task_stack.h>
+ #include <linux/sched/cputime.h>
+ #include <linux/cred.h>
+ #include <linux/dax.h>
+ #include <linux/uaccess.h>
+ #include <linux/xattr.h>
+ #include <asm/param.h>
+ #include <asm/page.h>
+ 
+ #ifndef user_long_t
+ #define user_long_t long
+ #endif
+ #ifndef user_siginfo_t
+ #define user_siginfo_t siginfo_t
+ #endif
+ 
+ /* That's for binfmt_elf_fdpic to deal with */
+ #ifndef elf_check_fdpic
+ #define elf_check_fdpic(ex) false
+ #endif
+ 
+ static int load_elf_binary(struct linux_binprm *bprm);
+ 
+ #ifdef CONFIG_USELIB
+ static int load_elf_library(struct file *);
+ #else
+ #define load_elf_library NULL
+ #endif
+ 
+ /*
+  * If we don't support core dumping, then supply a NULL so we
+  * don't even try.
+  */
+ #ifdef CONFIG_ELF_CORE
+ static int elf_core_dump(struct coredump_params *cprm);
+ #else
+ #define elf_core_dump	NULL
+ #endif
+ 
+ #ifdef CONFIG_MINISEC_MPROTECT
+ static void elf_handle_mprotect(struct vm_area_struct *vma, unsigned long newflags);
+ #endif
+ 
+ #if ELF_EXEC_PAGESIZE > PAGE_SIZE
+ #define ELF_MIN_ALIGN	ELF_EXEC_PAGESIZE
+ #else
+ #define ELF_MIN_ALIGN	PAGE_SIZE
+ #endif
+ 
+ #ifndef ELF_CORE_EFLAGS
+ #define ELF_CORE_EFLAGS	0
+ #endif
+ 
+ #define ELF_PAGESTART(_v) ((_v) & ~(unsigned long)(ELF_MIN_ALIGN-1))
+ #define ELF_PAGEOFFSET(_v) ((_v) & (ELF_MIN_ALIGN-1))
+ #define ELF_PAGEALIGN(_v) (((_v) + ELF_MIN_ALIGN - 1) & ~(ELF_MIN_ALIGN - 1))
+ 
+ static struct linux_binfmt elf_format = {
+ 	.module		= THIS_MODULE,
+ 	.load_binary	= load_elf_binary,
+ 	.load_shlib	= load_elf_library,
+ 	.core_dump	= elf_core_dump,
+ #ifdef CONFIG_MINISEC_MPROTECT
+ 	.handle_mprotect= elf_handle_mprotect,
+ #endif
+ 	.min_coredump	= ELF_EXEC_PAGESIZE,
+ };
+ 
+ #define BAD_ADDR(x) ((unsigned long)(x) >= TASK_SIZE)
+ 
+ static int set_brk(unsigned long start, unsigned long end, int prot)
+ {
+ 	const unsigned long lo = ELF_PAGEALIGN(start);
+ 	const unsigned long hi = ELF_PAGEALIGN(end);
+ 	int rc = 0;
+ 
+ 	/*
+ 	 * Map whatever is left of the bss once both ends are page aligned,
+ 	 * keeping it executable when the header asks for that (ppc32 does).
+ 	 */
+ 	if (hi > lo)
+ 		rc = vm_brk_flags(lo, hi - lo, prot & PROT_EXEC ? VM_EXEC : 0);
+ 	if (rc)
+ 		return rc;
+ 
+ 	current->mm->start_brk = current->mm->brk = hi;
+ 	return 0;
+ }
+ 
+ /*
+  * Zero the rest of the page that holds the end of the data section (the
+  * start of bss): those bytes came from the file and must not stay in
+  * memory.  Returns 0, or -EFAULT when the user range cannot be cleared.
+  */
+ static int padzero(unsigned long elf_bss)
+ {
+ 	const unsigned long used = ELF_PAGEOFFSET(elf_bss);
+ 
+ 	/* Already on an ELF_MIN_ALIGN boundary: nothing to wipe. */
+ 	if (!used)
+ 		return 0;
+ 
+ 	if (clear_user((void __user *) elf_bss, ELF_MIN_ALIGN - used))
+ 		return -EFAULT;
+ 	return 0;
+ }
+ 
+ /* Let's use some macros to make this stack manipulation a little clearer */
+ #ifdef CONFIG_STACK_GROWSUP
+ #define STACK_ADD(sp, items) ((elf_addr_t __user *)(sp) + (items))
+ #define STACK_ROUND(sp, items) \
+ 	((15 + (unsigned long) ((sp) + (items))) &~ 15UL)
+ #define STACK_ALLOC(sp, len) ({ \
+ 	elf_addr_t __user *old_sp = (elf_addr_t __user *)sp; sp += len; \
+ 	old_sp; })
+ #else
+ #define STACK_ADD(sp, items) ((elf_addr_t __user *)(sp) - (items))
+ #define STACK_ROUND(sp, items) \
+ 	(((unsigned long) (sp - items)) &~ 15UL)
+ #define STACK_ALLOC(sp, len) ({ sp -= len ; sp; })
+ #endif
+ 
+ #ifndef ELF_BASE_PLATFORM
+ /*
+  * AT_BASE_PLATFORM indicates the "real" hardware/microarchitecture.
+  * If the arch defines ELF_BASE_PLATFORM (in asm/elf.h), the value
+  * will be copied to the user stack in the same manner as AT_PLATFORM.
+  */
+ #define ELF_BASE_PLATFORM NULL
+ #endif
+ 
+ static int
+ create_elf_tables(struct linux_binprm *bprm, struct elfhdr *exec,
+ 		unsigned long load_addr, unsigned long interp_load_addr)
+ {
+ 	unsigned long p = bprm->p;
+ 	int argc = bprm->argc;
+ 	int envc = bprm->envc;
+ 	elf_addr_t __user *sp;
+ 	elf_addr_t __user *u_platform;
+ 	elf_addr_t __user *u_base_platform;
+ 	elf_addr_t __user *u_rand_bytes;
+ 	const char *k_platform = ELF_PLATFORM;
+ 	const char *k_base_platform = ELF_BASE_PLATFORM;
+ 	unsigned char k_rand_bytes[16];
+ 	int items;
+ 	elf_addr_t *elf_info;
+ 	int ei_index = 0;
+ 	const struct cred *cred = current_cred();
+ 	struct vm_area_struct *vma;
+ 	/* Lays out the initial user stack (strings, argc/argv/envp, auxv); returns 0, -EFAULT or -EINVAL. */
+ 	/*
+ 	 * In some cases (e.g. Hyper-Threading), we want to avoid L1
+ 	 * evictions by the processes running on the same package. One
+ 	 * thing we can do is to shuffle the initial stack for them.
+ 	 */
+ 
+ 	p = arch_align_stack(p);
+ 
+ 	/*
+ 	 * If this architecture has a platform capability string, copy it
+ 	 * to userspace.  In some cases (Sparc), this info is impossible
+ 	 * for userspace to get any other way, in others (i386) it is
+ 	 * merely difficult.
+ 	 */
+ 	u_platform = NULL;
+ 	if (k_platform) {
+ 		size_t len = strlen(k_platform) + 1;
+ 
+ 		u_platform = (elf_addr_t __user *)STACK_ALLOC(p, len);
+ 		if (__copy_to_user(u_platform, k_platform, len))
+ 			return -EFAULT;
+ 	}
+ 
+ 	/*
+ 	 * If this architecture has a "base" platform capability
+ 	 * string, copy it to userspace.
+ 	 */
+ 	u_base_platform = NULL;
+ 	if (k_base_platform) {
+ 		size_t len = strlen(k_base_platform) + 1;
+ 
+ 		u_base_platform = (elf_addr_t __user *)STACK_ALLOC(p, len);
+ 		if (__copy_to_user(u_base_platform, k_base_platform, len))
+ 			return -EFAULT;
+ 	}
+ 
+ 	/*
+ 	 * Generate 16 random bytes for userspace PRNG seeding.
+ 	 */
+ 	get_random_bytes(k_rand_bytes, sizeof(k_rand_bytes));
+ 	u_rand_bytes = (elf_addr_t __user *)
+ 		       STACK_ALLOC(p, sizeof(k_rand_bytes));
+ 	if (__copy_to_user(u_rand_bytes, k_rand_bytes, sizeof(k_rand_bytes)))
+ 		return -EFAULT;
+ 
+ 	/* Create the ELF interpreter info (aux vector), staged in mm->saved_auxv */
+ 	elf_info = (elf_addr_t *)current->mm->saved_auxv;
+ 	/* update AT_VECTOR_SIZE_BASE if the number of NEW_AUX_ENT() changes */
+ #define NEW_AUX_ENT(id, val) \
+ 	do { \
+ 		elf_info[ei_index++] = id; \
+ 		elf_info[ei_index++] = val; \
+ 	} while (0)
+ 
+ #ifdef ARCH_DLINFO
+ 	/* 
+ 	 * ARCH_DLINFO must come first so PPC can do its special alignment of
+ 	 * AUXV.
+ 	 * update AT_VECTOR_SIZE_ARCH if the number of NEW_AUX_ENT() in
+ 	 * ARCH_DLINFO changes
+ 	 */
+ 	ARCH_DLINFO;
+ #endif
+ 	NEW_AUX_ENT(AT_HWCAP, ELF_HWCAP);
+ 	NEW_AUX_ENT(AT_PAGESZ, ELF_EXEC_PAGESIZE);
+ 	NEW_AUX_ENT(AT_CLKTCK, CLOCKS_PER_SEC);
+ 	NEW_AUX_ENT(AT_PHDR, load_addr + exec->e_phoff);
+ 	NEW_AUX_ENT(AT_PHENT, sizeof(struct elf_phdr));
+ 	NEW_AUX_ENT(AT_PHNUM, exec->e_phnum);
+ 	NEW_AUX_ENT(AT_BASE, interp_load_addr);
+ 	NEW_AUX_ENT(AT_FLAGS, 0);
+ 	NEW_AUX_ENT(AT_ENTRY, exec->e_entry);
+ 	NEW_AUX_ENT(AT_UID, from_kuid_munged(cred->user_ns, cred->uid));
+ 	NEW_AUX_ENT(AT_EUID, from_kuid_munged(cred->user_ns, cred->euid));
+ 	NEW_AUX_ENT(AT_GID, from_kgid_munged(cred->user_ns, cred->gid));
+ 	NEW_AUX_ENT(AT_EGID, from_kgid_munged(cred->user_ns, cred->egid));
+ 	NEW_AUX_ENT(AT_SECURE, bprm->secureexec);
+ 	NEW_AUX_ENT(AT_RANDOM, (elf_addr_t)(unsigned long)u_rand_bytes);
+ #ifdef ELF_HWCAP2
+ 	NEW_AUX_ENT(AT_HWCAP2, ELF_HWCAP2);
+ #endif
+ 	NEW_AUX_ENT(AT_EXECFN, bprm->exec);
+ 	if (k_platform) {
+ 		NEW_AUX_ENT(AT_PLATFORM,
+ 			    (elf_addr_t)(unsigned long)u_platform);
+ 	}
+ 	if (k_base_platform) {
+ 		NEW_AUX_ENT(AT_BASE_PLATFORM,
+ 			    (elf_addr_t)(unsigned long)u_base_platform);
+ 	}
+ 	if (bprm->interp_flags & BINPRM_FLAGS_EXECFD) {
+ 		NEW_AUX_ENT(AT_EXECFD, bprm->interp_data);
+ 	}
+ #undef NEW_AUX_ENT
+ 	/* AT_NULL is zero; clear the rest too */
+ 	memset(&elf_info[ei_index], 0,
+ 	       sizeof current->mm->saved_auxv - ei_index * sizeof elf_info[0]);
+ 
+ 	/* And advance past the AT_NULL entry.  */
+ 	ei_index += 2;
+ 
+ 	sp = STACK_ADD(p, ei_index);
+ 
+ 	items = (argc + 1) + (envc + 1) + 1;	/* argv + NULL, envp + NULL, argc */
+ 	bprm->p = STACK_ROUND(sp, items);
+ 
+ 	/* Point sp at the lowest address on the stack */
+ #ifdef CONFIG_STACK_GROWSUP
+ 	sp = (elf_addr_t __user *)bprm->p - items - ei_index;
+ 	bprm->exec = (unsigned long)sp; /* XXX: PARISC HACK */
+ #else
+ 	sp = (elf_addr_t __user *)bprm->p;
+ #endif
+ 
+ 
+ 	/*
+ 	 * Grow the stack manually; some architectures have a limit on how
+ 	 * far ahead a user-space access may be in order to grow the stack.
+ 	 */
+ 	vma = find_extend_vma(current->mm, bprm->p);
+ 	if (!vma)
+ 		return -EFAULT;
+ 
+ 	/* Now, let's put argc (and argv, envp if appropriate) on the stack */
+ 	if (__put_user(argc, sp++))
+ 		return -EFAULT;
+ 
+ 	/* Populate list of argv pointers back to argv strings. */
+ 	p = current->mm->arg_end = current->mm->arg_start;
+ 	while (argc-- > 0) {
+ 		size_t len;
+ 		if (__put_user((elf_addr_t)p, sp++))
+ 			return -EFAULT;
+ 		len = strnlen_user((void __user *)p, MAX_ARG_STRLEN);
+ 		if (!len || len > MAX_ARG_STRLEN)
+ 			return -EINVAL;
+ 		p += len;
+ 	}
+ 	if (__put_user(0, sp++))
+ 		return -EFAULT;
+ 	current->mm->arg_end = p;
+ 
+ 	/* Populate list of envp pointers back to envp strings. */
+ 	current->mm->env_end = current->mm->env_start = p;
+ 	while (envc-- > 0) {
+ 		size_t len;
+ 		if (__put_user((elf_addr_t)p, sp++))
+ 			return -EFAULT;
+ 		len = strnlen_user((void __user *)p, MAX_ARG_STRLEN);
+ 		if (!len || len > MAX_ARG_STRLEN)
+ 			return -EINVAL;
+ 		p += len;
+ 	}
+ 	if (__put_user(0, sp++))
+ 		return -EFAULT;
+ 	current->mm->env_end = p;
+ 
+ 	/* Put the elf_info on the stack in the right place.  */
+ 	if (copy_to_user(sp, elf_info, ei_index * sizeof(elf_addr_t)))
+ 		return -EFAULT;
+ 	return 0;
+ }
+ 
+ #ifndef elf_map
+ 
+ static unsigned long elf_map(struct file *filep, unsigned long addr,
+ 		const struct elf_phdr *eppnt, int prot, int type,
+ 		unsigned long total_size)
+ {
+ 	const unsigned long in_page = ELF_PAGEOFFSET(eppnt->p_vaddr);
+ 	const unsigned long off = eppnt->p_offset - in_page;
+ 	const unsigned long size = ELF_PAGEALIGN(eppnt->p_filesz + in_page);
+ 	unsigned long map_addr;
+ 
+ 	addr = ELF_PAGESTART(addr);
+ 
+ 	/* A segment without file contents is valid, yet mmap() would answer
+ 	 * -EINVAL for a zero length, so report the aligned address directly. */
+ 	if (!size)
+ 		return addr;
+ 
+ 	/*
+ 	 * A non-zero total_size is the span of the whole ELF image.  The first
+ 	 * mmap of an image must reserve that full span, otherwise
+ 	 * randomization could place it where it overlaps the ELF binary.
+ 	 * Map the big area, then unmap everything past this segment (which
+ 	 * also takes care of images with holes).
+ 	 */
+ 	if (!total_size) {
+ 		map_addr = vm_mmap(filep, addr, size, prot, type, off);
+ 	} else {
+ 		total_size = ELF_PAGEALIGN(total_size);
+ 		map_addr = vm_mmap(filep, addr, total_size, prot, type, off);
+ 		if (!BAD_ADDR(map_addr))
+ 			vm_munmap(map_addr + size, total_size - size);
+ 	}
+ 
+ 	if ((type & MAP_FIXED_NOREPLACE) && PTR_ERR((void *)map_addr) == -EEXIST)
+ 		pr_info("%d (%s): Uhuuh, elf segment at %px requested but the memory is mapped already\n",
+ 			task_pid_nr(current), current->comm, (void *)addr);
+ 
+ 	return map_addr;
+ }
+ 
+ #endif /* !elf_map */
+ 
+ static unsigned long total_mapping_size(const struct elf_phdr *cmds, int nr)
+ {
+ 	const struct elf_phdr *first = NULL, *last = NULL;
+ 	int idx;
+ 
+ 	/* Remember the first and the last PT_LOAD entries, in table order. */
+ 	for (idx = 0; idx < nr; idx++) {
+ 		if (cmds[idx].p_type != PT_LOAD)
+ 			continue;
+ 		if (!first)
+ 			first = &cmds[idx];
+ 		last = &cmds[idx];
+ 	}
+ 	if (!first)
+ 		return 0;
+ 	return last->p_vaddr + last->p_memsz - ELF_PAGESTART(first->p_vaddr);
+ }
+ 
+ /**
+  * load_elf_phdrs() - load ELF program headers
+  * @elf_ex:   ELF header of the binary whose program headers should be loaded
+  * @elf_file: the opened ELF binary file
+  *
+  * Reads the program header table described by @elf_ex out of @elf_file into
+  * a newly allocated array.  The table is refused when its entry size is not
+  * sizeof(struct elf_phdr), when it is empty, or when its total size exceeds
+  * 65536 bytes or ELF_MIN_ALIGN.
+  *
+  * Return: the array, which the caller is responsible for freeing, or NULL
+  * on any failure (rejected header, failed allocation, short or failed
+  * read).  The errno of a failed read is not passed on to the caller.
+  */
+ static struct elf_phdr *load_elf_phdrs(const struct elfhdr *elf_ex,
+ 				       struct file *elf_file)
+ {
+ 	struct elf_phdr *phdrs;
+ 	loff_t pos = elf_ex->e_phoff;
+ 	unsigned int size;
+ 	int nread;
+ 
+ 	/*
+ 	 * If the size of this structure has changed, then punt, since
+ 	 * we will be doing the wrong thing.
+ 	 */
+ 	if (elf_ex->e_phentsize != sizeof(struct elf_phdr))
+ 		return NULL;
+ 
+ 	/* Sanity check the number of program headers and their total size. */
+ 	size = sizeof(struct elf_phdr) * elf_ex->e_phnum;
+ 	if (size == 0 || size > 65536 || size > ELF_MIN_ALIGN)
+ 		return NULL;
+ 
+ 	phdrs = kmalloc(size, GFP_KERNEL);
+ 	if (!phdrs)
+ 		return NULL;
+ 
+ 	/*
+ 	 * Read in the program headers; anything short of a complete read
+ 	 * makes the table unusable.
+ 	 */
+ 	nread = kernel_read(elf_file, phdrs, size, &pos);
+ 	if (nread != size) {
+ 		kfree(phdrs);
+ 		return NULL;
+ 	}
+ 
+ 	return phdrs;
+ }
+ 
+ #ifndef CONFIG_ARCH_BINFMT_ELF_STATE
+ 
+ /**
+  * struct arch_elf_state - arch-specific ELF loading state
+  *
+  * This structure is used to preserve architecture specific data during
+  * the loading of an ELF file, throughout the checking of architecture
+  * specific ELF headers & through to the point where the ELF load is
+  * known to be proceeding (ie. SET_PERSONALITY).
+  *
+  * This implementation is a dummy for architectures which require no
+  * specific state.
+  */
+ struct arch_elf_state {
+ };
+ 
+ #define INIT_ARCH_ELF_STATE {}
+ 
+ /**
+  * arch_elf_pt_proc() - check a PT_LOPROC..PT_HIPROC ELF program header
+  * @ehdr:	The main ELF header
+  * @phdr:	The program header to check
+  * @elf:	The open ELF file
+  * @is_interp:	True if the phdr is from the interpreter of the ELF being
+  *		loaded, else false.
+  * @state:	Architecture-specific state preserved throughout the process
+  *		of loading the ELF.
+  *
+  * Inspects the program header phdr to validate its correctness and/or
+  * suitability for the system. Called once per ELF program header in the
+  * range PT_LOPROC to PT_HIPROC, for both the ELF being loaded and its
+  * interpreter.
+  *
+  * Return: Zero to proceed with the ELF load, non-zero to fail the ELF load
+  *         with that return code.
+  */
+ static inline int arch_elf_pt_proc(struct elfhdr *ehdr,
+ 				   struct elf_phdr *phdr,
+ 				   struct file *elf, bool is_interp,
+ 				   struct arch_elf_state *state)
+ {
+ 	/* Dummy implementation: no argument is inspected, always proceed */
+ 	return 0;
+ }
+ 
+ /**
+  * arch_check_elf() - check an ELF executable
+  * @ehdr:	The main ELF header
+  * @has_interp:	True if the ELF has an interpreter, else false.
+  * @interp_ehdr: The interpreter's ELF header
+  * @state:	Architecture-specific state preserved throughout the process
+  *		of loading the ELF.
+  *
+  * Provides a final opportunity for architecture code to reject the loading
+  * of the ELF & cause an exec syscall to return an error. This is called after
+  * all program headers to be checked by arch_elf_pt_proc have been.
+  *
+  * Return: Zero to proceed with the ELF load, non-zero to fail the ELF load
+  *         with that return code.
+  */
+ static inline int arch_check_elf(struct elfhdr *ehdr, bool has_interp,
+ 				 struct elfhdr *interp_ehdr,
+ 				 struct arch_elf_state *state)
+ {
+ 	/* Dummy implementation: no argument is inspected, always proceed */
+ 	return 0;
+ }
+ 
+ #endif /* !CONFIG_ARCH_BINFMT_ELF_STATE */
+ 
+ /*
+  * Translate the PF_R/PF_W/PF_X bits of an ELF segment into the matching
+  * PROT_READ/PROT_WRITE/PROT_EXEC bits.
+  */
+ static inline int make_prot(u32 p_flags)
+ {
+ 	const int rd = (p_flags & PF_R) ? PROT_READ : 0;
+ 	const int wr = (p_flags & PF_W) ? PROT_WRITE : 0;
+ 	const int ex = (p_flags & PF_X) ? PROT_EXEC : 0;
+ 
+ 	return rd | wr | ex;
+ }
+ 
+ /* This is much more generalized than the library routine read function,
+    so we keep this separate.  Technically the library read function
+    is only provided so that we can read a.out libraries that have
+    an ELF header.  Returns load_addr (the offset added to each p_vaddr) on
+    success, otherwise ~0UL or a negative errno cast to unsigned long. */
+ static unsigned long load_elf_interp(struct elfhdr *interp_elf_ex,
+ 		struct file *interpreter, unsigned long *interp_map_addr,
+ 		unsigned long no_base, struct elf_phdr *interp_elf_phdata)
+ {
+ 	struct elf_phdr *eppnt;
+ 	unsigned long load_addr = 0;
+ 	int load_addr_set = 0;
+ 	unsigned long last_bss = 0, elf_bss = 0;
+ 	int bss_prot = 0;
+ 	unsigned long error = ~0UL;
+ 	unsigned long total_size;
+ 	int i;
+ 
+ 	/* First of all, some simple consistency checks */
+ 	if (interp_elf_ex->e_type != ET_EXEC &&
+ 	    interp_elf_ex->e_type != ET_DYN)
+ 		goto out;
+ 	if (!elf_check_arch(interp_elf_ex) ||
+ 	    elf_check_fdpic(interp_elf_ex))
+ 		goto out;
+ 	if (!interpreter->f_op->mmap)
+ 		goto out;
+ 
+ 	total_size = total_mapping_size(interp_elf_phdata,
+ 					interp_elf_ex->e_phnum);
+ 	if (!total_size) {
+ 		error = -EINVAL;
+ 		goto out;
+ 	}
+ 
+ 	eppnt = interp_elf_phdata;
+ 	for (i = 0; i < interp_elf_ex->e_phnum; i++, eppnt++) {
+ 		if (eppnt->p_type == PT_LOAD) {
+ 			int elf_type = MAP_PRIVATE | MAP_DENYWRITE;
+ 			int elf_prot = make_prot(eppnt->p_flags);
+ 			unsigned long vaddr = 0;
+ 			unsigned long k, map_addr;
+ 
+ 			vaddr = eppnt->p_vaddr;
+ 			if (interp_elf_ex->e_type == ET_EXEC || load_addr_set)
+ 				elf_type |= MAP_FIXED_NOREPLACE;
+ 			else if (no_base && interp_elf_ex->e_type == ET_DYN)
+ 				load_addr = -vaddr;	/* makes the address passed to elf_map() 0 */
+ 
+ 			map_addr = elf_map(interpreter, load_addr + vaddr,
+ 					eppnt, elf_prot, elf_type, total_size);
+ 			total_size = 0;	/* only the first elf_map() call gets the full span */
+ 			if (!*interp_map_addr)
+ 				*interp_map_addr = map_addr;
+ 			error = map_addr;
+ 			if (BAD_ADDR(map_addr))
+ 				goto out;
+ 
+ 			if (!load_addr_set &&
+ 			    interp_elf_ex->e_type == ET_DYN) {
+ 				load_addr = map_addr - ELF_PAGESTART(vaddr);
+ 				load_addr_set = 1;
+ 			}
+ 
+ 			/*
+ 			 * Check to see if the section's size will overflow the
+ 			 * allowed task size. Note that p_filesz must always be
+ 			 * <= p_memsz so it's only necessary to check p_memsz.
+ 			 */
+ 			k = load_addr + eppnt->p_vaddr;
+ 			if (BAD_ADDR(k) ||
+ 			    eppnt->p_filesz > eppnt->p_memsz ||
+ 			    eppnt->p_memsz > TASK_SIZE ||
+ 			    TASK_SIZE - eppnt->p_memsz < k) {
+ 				error = -ENOMEM;
+ 				goto out;
+ 			}
+ 
+ 			/*
+ 			 * Find the end of the file mapping for this phdr, and
+ 			 * keep track of the largest address we see for this.
+ 			 */
+ 			k = load_addr + eppnt->p_vaddr + eppnt->p_filesz;
+ 			if (k > elf_bss)
+ 				elf_bss = k;
+ 
+ 			/*
+ 			 * Do the same thing for the memory mapping - between
+ 			 * elf_bss and last_bss is the bss section.
+ 			 */
+ 			k = load_addr + eppnt->p_vaddr + eppnt->p_memsz;
+ 			if (k > last_bss) {
+ 				last_bss = k;
+ 				bss_prot = elf_prot;
+ 			}
+ 		}
+ 	}
+ 
+ 	/*
+ 	 * Now fill out the bss section: first pad the last page from
+ 	 * the file up to the page boundary, and zero it from elf_bss
+ 	 * up to the end of the page.
+ 	 */
+ 	if (padzero(elf_bss)) {
+ 		error = -EFAULT;
+ 		goto out;
+ 	}
+ 	/*
+ 	 * Next, align both the file and mem bss up to the page size,
+ 	 * since this is where elf_bss was just zeroed up to, and where
+ 	 * last_bss will end after the vm_brk_flags() below.
+ 	 */
+ 	elf_bss = ELF_PAGEALIGN(elf_bss);
+ 	last_bss = ELF_PAGEALIGN(last_bss);
+ 	/* Finally, if there is still more bss to allocate, do it. */
+ 	if (last_bss > elf_bss) {
+ 		error = vm_brk_flags(elf_bss, last_bss - elf_bss,
+ 				bss_prot & PROT_EXEC ? VM_EXEC : 0);
+ 		if (error)
+ 			goto out;
+ 	}
+ 
+ 	error = load_addr;
+ out:
+ 	return error;
+ }
+ 
+ #ifdef CONFIG_MINISEC_PT_PAX_FLAGS
+ #ifdef CONFIG_MINISEC_SOFTMODE
+ static unsigned long pax_parse_pt_pax_softmode(const struct elf_phdr * const elf_phdata)
+ {
+ 	unsigned long pax_flags = 0UL;
+ 	/* Opt-in: a configured feature is enabled only if its PF_* bit is set in p_flags. */
+ #ifdef CONFIG_MINISEC_PAGEEXEC
+ 	if (elf_phdata->p_flags & PF_PAGEEXEC)
+ 		pax_flags |= MF_PAX_PAGEEXEC;
+ #endif
+ 
+ #ifdef CONFIG_MINISEC_SEGMEXEC
+ 	if (elf_phdata->p_flags & PF_SEGMEXEC)
+ 		pax_flags |= MF_PAX_SEGMEXEC;
+ #endif
+ 	/* EMUTRAMP is only granted when PAGEEXEC or SEGMEXEC was enabled above. */
+ #ifdef CONFIG_MINISEC_EMUTRAMP
+ 	if ((elf_phdata->p_flags & PF_EMUTRAMP) && (pax_flags & (MF_PAX_PAGEEXEC | MF_PAX_SEGMEXEC)))
+ 		pax_flags |= MF_PAX_EMUTRAMP;
+ #endif
+ 
+ #ifdef CONFIG_MINISEC_MPROTECT
+ 	if (elf_phdata->p_flags & PF_MPROTECT)
+ 		pax_flags |= MF_PAX_MPROTECT;
+ #endif
+ 	/* RANDMMAP additionally requires randomize_va_space to be non-zero. */
+ #if defined(CONFIG_MINISEC_RANDMMAP) || defined(CONFIG_MINISEC_RANDUSTACK)
+ 	if (randomize_va_space && (elf_phdata->p_flags & PF_RANDMMAP))
+ 		pax_flags |= MF_PAX_RANDMMAP;
+ #endif
+ 
+ 	return pax_flags;
+ }
+ #endif
+ 
+ static unsigned long pax_parse_pt_pax_hardmode(const struct elf_phdr * const elf_phdata)
+ {
+ 	unsigned long enabled = 0UL;
+ 
+ 	/* Hardmode is opt-out: a feature is enabled unless its PF_NO* bit is set. */
+ #ifdef CONFIG_MINISEC_PAGEEXEC
+ 	enabled |= (elf_phdata->p_flags & PF_NOPAGEEXEC) ? 0UL : MF_PAX_PAGEEXEC;
+ #endif
+ 
+ 	/* SEGMEXEC follows the same opt-out rule. */
+ #ifdef CONFIG_MINISEC_SEGMEXEC
+ 	enabled |= (elf_phdata->p_flags & PF_NOSEGMEXEC) ? 0UL : MF_PAX_SEGMEXEC;
+ #endif
+ 
+ 	/* EMUTRAMP is tested on its own here; PAGEEXEC/SEGMEXEC are not consulted. */
+ #ifdef CONFIG_MINISEC_EMUTRAMP
+ 	enabled |= (elf_phdata->p_flags & PF_NOEMUTRAMP) ? 0UL : MF_PAX_EMUTRAMP;
+ #endif
+ 
+ 	/* MPROTECT: on unless PF_NOMPROTECT is present. */
+ #ifdef CONFIG_MINISEC_MPROTECT
+ 	enabled |= (elf_phdata->p_flags & PF_NOMPROTECT) ? 0UL : MF_PAX_MPROTECT;
+ #endif
+ 
+ 	/* RANDMMAP additionally requires randomize_va_space to be non-zero. */
+ #if defined(CONFIG_MINISEC_RANDMMAP) || defined(CONFIG_MINISEC_RANDUSTACK)
+ 	enabled |= (randomize_va_space && !(elf_phdata->p_flags & PF_NORANDMMAP)) ? MF_PAX_RANDMMAP : 0UL;
+ #endif
+ 
+ 	return enabled;
+ }
+ #endif
+ 
+ #ifdef CONFIG_MINISEC_XATTR_PAX_FLAGS
+ #ifdef CONFIG_MINISEC_SOFTMODE
+ static unsigned long pax_parse_xattr_pax_softmode(unsigned long pax_flags_softmode)
+ {
+ 	unsigned long enabled = 0UL;
+ 
+ 	/* Softmode: only features named in pax_flags_softmode get enabled. */
+ #ifdef CONFIG_MINISEC_PAGEEXEC
+ 	enabled |= (pax_flags_softmode & MF_PAX_PAGEEXEC) ? MF_PAX_PAGEEXEC : 0UL;
+ #endif
+ 
+ 	/* SEGMEXEC follows the same opt-in rule. */
+ #ifdef CONFIG_MINISEC_SEGMEXEC
+ 	enabled |= (pax_flags_softmode & MF_PAX_SEGMEXEC) ? MF_PAX_SEGMEXEC : 0UL;
+ #endif
+ 
+ 	/* EMUTRAMP is honoured only once PAGEEXEC or SEGMEXEC has been enabled. */
+ #ifdef CONFIG_MINISEC_EMUTRAMP
+ 	if ((enabled & (MF_PAX_PAGEEXEC | MF_PAX_SEGMEXEC)) && (pax_flags_softmode & MF_PAX_EMUTRAMP))
+ 		enabled |= MF_PAX_EMUTRAMP;
+ #endif
+ 	/* MPROTECT does not depend on the flags collected so far. */
+ #ifdef CONFIG_MINISEC_MPROTECT
+ 	enabled |= (pax_flags_softmode & MF_PAX_MPROTECT) ? MF_PAX_MPROTECT : 0UL;
+ #endif
+ 
+ 	/* RANDMMAP additionally requires randomize_va_space to be non-zero. */
+ #if defined(CONFIG_MINISEC_RANDMMAP) || defined(CONFIG_MINISEC_RANDUSTACK)
+ 	enabled |= (randomize_va_space && (pax_flags_softmode & MF_PAX_RANDMMAP)) ? MF_PAX_RANDMMAP : 0UL;
+ #endif
+ 
+ 	return enabled;
+ }
+ #endif
+ 
+ static unsigned long pax_parse_xattr_pax_hardmode(unsigned long pax_flags_hardmode)
+ {
+ 	unsigned long enabled = 0UL;
+ 
+ 	/* Hardmode: a bit set in pax_flags_hardmode switches that feature off. */
+ #ifdef CONFIG_MINISEC_PAGEEXEC
+ 	enabled |= (pax_flags_hardmode & MF_PAX_PAGEEXEC) ? 0UL : MF_PAX_PAGEEXEC;
+ #endif
+ 
+ 	/* SEGMEXEC follows the same opt-out rule. */
+ #ifdef CONFIG_MINISEC_SEGMEXEC
+ 	enabled |= (pax_flags_hardmode & MF_PAX_SEGMEXEC) ? 0UL : MF_PAX_SEGMEXEC;
+ #endif
+ 
+ 	/* EMUTRAMP is tested on its own here; PAGEEXEC/SEGMEXEC are not consulted. */
+ #ifdef CONFIG_MINISEC_EMUTRAMP
+ 	enabled |= (pax_flags_hardmode & MF_PAX_EMUTRAMP) ? 0UL : MF_PAX_EMUTRAMP;
+ #endif
+ 
+ 	/* MPROTECT: on unless its bit is present in pax_flags_hardmode. */
+ #ifdef CONFIG_MINISEC_MPROTECT
+ 	enabled |= (pax_flags_hardmode & MF_PAX_MPROTECT) ? 0UL : MF_PAX_MPROTECT;
+ #endif
+ 
+ 	/* RANDMMAP additionally requires randomize_va_space to be non-zero. */
+ #if defined(CONFIG_MINISEC_RANDMMAP) || defined(CONFIG_MINISEC_RANDUSTACK)
+ 	enabled |= (randomize_va_space && !(pax_flags_hardmode & MF_PAX_RANDMMAP)) ? MF_PAX_RANDMMAP : 0UL;
+ #endif
+ 
+ 	return enabled;
+ }
+ #endif
+ 
+ #if defined(CONFIG_MINISEC_NOEXEC) || defined(CONFIG_MINISEC_ASLR)
+ static unsigned long pax_parse_defaults(void)
+ {
+ 	/* Start from "nothing enabled"; with pax_softmode set that is the answer. */
+ 	unsigned long defaults = 0UL;
+ 
+ #ifdef CONFIG_MINISEC_SOFTMODE
+ 	if (pax_softmode)
+ 		return defaults;
+ #endif
+ 
+ 	/* Otherwise PAGEEXEC, SEGMEXEC and MPROTECT are on whenever compiled in. */
+ #ifdef CONFIG_MINISEC_PAGEEXEC
+ 	defaults |= MF_PAX_PAGEEXEC;
+ #endif
+ #ifdef CONFIG_MINISEC_SEGMEXEC
+ 	defaults |= MF_PAX_SEGMEXEC;
+ #endif
+ #ifdef CONFIG_MINISEC_MPROTECT
+ 	defaults |= MF_PAX_MPROTECT;
+ #endif
+ 
+ 	/* RANDMMAP additionally requires randomize_va_space to be non-zero. */
+ #ifdef CONFIG_MINISEC_RANDMMAP
+ 	defaults |= randomize_va_space ? MF_PAX_RANDMMAP : 0UL;
+ #endif
+ 
+ 	return defaults;
+ }
+ 
+ static unsigned long pax_parse_ei_pax(const struct elfhdr * const elf_ex)
+ {
+ 	/* Without CONFIG_MINISEC_EI_PAX, or with pax_softmode set, FALLBACK is returned. */
+ 	unsigned long result = PAX_PARSE_FLAGS_FALLBACK;
+ 
+ #ifdef CONFIG_MINISEC_EI_PAX
+ 
+ #ifdef CONFIG_MINISEC_SOFTMODE
+ 	if (pax_softmode)
+ 		return result;
+ #endif
+ 
+ 	/* A set EF_PAX_* bit in e_ident[EI_PAX] opts out, except EMUTRAMP (opt-in). */
+ 	result = 0UL;
+ 
+ #ifdef CONFIG_MINISEC_PAGEEXEC
+ 	result |= (elf_ex->e_ident[EI_PAX] & EF_PAX_PAGEEXEC) ? 0UL : MF_PAX_PAGEEXEC;
+ #endif
+ 
+ #ifdef CONFIG_MINISEC_SEGMEXEC
+ 	result |= (elf_ex->e_ident[EI_PAX] & EF_PAX_SEGMEXEC) ? 0UL : MF_PAX_SEGMEXEC;
+ #endif
+ 
+ 	/* EMUTRAMP and MPROTECT are considered only on top of PAGEEXEC or SEGMEXEC. */
+ #ifdef CONFIG_MINISEC_EMUTRAMP
+ 	if ((elf_ex->e_ident[EI_PAX] & EF_PAX_EMUTRAMP) && (result & (MF_PAX_PAGEEXEC | MF_PAX_SEGMEXEC)))
+ 		result |= MF_PAX_EMUTRAMP;
+ #endif
+ 
+ #ifdef CONFIG_MINISEC_MPROTECT
+ 	if (!(elf_ex->e_ident[EI_PAX] & EF_PAX_MPROTECT) && (result & (MF_PAX_PAGEEXEC | MF_PAX_SEGMEXEC)))
+ 		result |= MF_PAX_MPROTECT;
+ #endif
+ 
+ 	/* RANDMMAP additionally requires randomize_va_space to be non-zero. */
+ #ifdef CONFIG_MINISEC_ASLR
+ 	result |= (randomize_va_space && !(elf_ex->e_ident[EI_PAX] & EF_PAX_RANDMMAP)) ? MF_PAX_RANDMMAP : 0UL;
+ #endif
+ 
+ #endif /* CONFIG_MINISEC_EI_PAX */
+ 
+ 	return result;
+ }
+ 
+ static unsigned long pax_parse_pt_pax(const struct elfhdr * const elf_ex, const struct elf_phdr * const elf_phdata)
+ {
+ 	/* Only the first PT_PAX_FLAGS header is examined; none at all means FALLBACK. */
+ #ifdef CONFIG_MINISEC_PT_PAX_FLAGS
+ 	const struct elf_phdr *ph;
+ 	unsigned long idx;
+ 
+ 	for (idx = 0UL; idx < elf_ex->e_phnum; idx++) {
+ 		ph = &elf_phdata[idx];
+ 		if (ph->p_type != PT_PAX_FLAGS)
+ 			continue;
+ 
+ 		/* Requesting a feature together with its negation is rejected. */
+ 		if (((ph->p_flags & PF_PAGEEXEC) && (ph->p_flags & PF_NOPAGEEXEC)) ||
+ 		    ((ph->p_flags & PF_SEGMEXEC) && (ph->p_flags & PF_NOSEGMEXEC)) ||
+ 		    ((ph->p_flags & PF_EMUTRAMP) && (ph->p_flags & PF_NOEMUTRAMP)) ||
+ 		    ((ph->p_flags & PF_MPROTECT) && (ph->p_flags & PF_NOMPROTECT)) ||
+ 		    ((ph->p_flags & PF_RANDMMAP) && (ph->p_flags & PF_NORANDMMAP)))
+ 			return PAX_PARSE_FLAGS_FALLBACK;
+ #ifdef CONFIG_MINISEC_SOFTMODE
+ 		if (pax_softmode)
+ 			return pax_parse_pt_pax_softmode(ph);
+ #endif
+ 		return pax_parse_pt_pax_hardmode(ph);
+ 	}
+ #endif
+ 	return PAX_PARSE_FLAGS_FALLBACK;
+ }
+ 
+ static unsigned long pax_parse_xattr_pax(struct file * const file)
+ {
+ 	/* Parse the value fetched by pax_getxattr(); FALLBACK on any parse problem. */
+ #ifdef CONFIG_MINISEC_XATTR_PAX_FLAGS
+ 	ssize_t xattr_size, i;
+ 	unsigned char xattr_value[sizeof("pemrs") - 1];	/* room for five flag letters */
+ 	unsigned long pax_flags_hardmode = 0UL, pax_flags_softmode = 0UL;
+ 
+ 	xattr_size = pax_getxattr(file->f_path.dentry, xattr_value, sizeof xattr_value);
+ 	if (xattr_size < 0 || xattr_size > sizeof xattr_value)
+ 		return PAX_PARSE_FLAGS_FALLBACK;
+ 	/* Lower-case letters collect in pax_flags_hardmode, upper-case in pax_flags_softmode. */
+ 	for (i = 0; i < xattr_size; i++)
+ 		switch (xattr_value[i]) {
+ 		default:	/* any byte that is not one of the ten known letters */
+ 			return PAX_PARSE_FLAGS_FALLBACK;
+ 		/* Each parse_flag() expands to two case labels; a repeated letter is rejected. */
+ #define parse_flag(option1, option2, flag)			\
+ 		case option1:					\
+ 			if (pax_flags_hardmode & MF_PAX_##flag)	\
+ 				return PAX_PARSE_FLAGS_FALLBACK;\
+ 			pax_flags_hardmode |= MF_PAX_##flag;	\
+ 			break;					\
+ 		case option2:					\
+ 			if (pax_flags_softmode & MF_PAX_##flag)	\
+ 				return PAX_PARSE_FLAGS_FALLBACK;\
+ 			pax_flags_softmode |= MF_PAX_##flag;	\
+ 			break;
+ 
+ 		parse_flag('p', 'P', PAGEEXEC);
+ 		parse_flag('e', 'E', EMUTRAMP);
+ 		parse_flag('m', 'M', MPROTECT);
+ 		parse_flag('r', 'R', RANDMMAP);
+ 		parse_flag('s', 'S', SEGMEXEC);
+ 
+ #undef parse_flag
+ 		}
+ 	/* The same flag must not appear in both the lower- and upper-case set. */
+ 	if (pax_flags_hardmode & pax_flags_softmode)
+ 		return PAX_PARSE_FLAGS_FALLBACK;
+ 
+ #ifdef CONFIG_MINISEC_SOFTMODE
+ 	if (pax_softmode)
+ 		return pax_parse_xattr_pax_softmode(pax_flags_softmode);
+ 	else
+ #endif
+ 
+ 		return pax_parse_xattr_pax_hardmode(pax_flags_hardmode);
+ #else
+ 	return PAX_PARSE_FLAGS_FALLBACK;
+ #endif
+ 
+ }
+ 
+ static long pax_parse_pax_flags(const struct elfhdr * const elf_ex, const struct elf_phdr * const elf_phdata, struct file * const file)
+ {
+ 	const unsigned long none = PAX_PARSE_FLAGS_FALLBACK;
+ 	unsigned long chosen = pax_parse_defaults();
+ 	const unsigned long from_ei = pax_parse_ei_pax(elf_ex);
+ 	const unsigned long from_pt = pax_parse_pt_pax(elf_ex, elf_phdata);
+ 	const unsigned long from_xattr = pax_parse_xattr_pax(file);
+ 
+ 	/* When both PT_PAX_FLAGS and the xattr yield flags, they must agree. */
+ 	if (from_pt != none && from_xattr != none &&
+ 	    from_pt != from_xattr)
+ 		return -EINVAL;
+ 
+ 	/* Precedence: xattr, then PT_PAX_FLAGS, then EI_PAX, then the defaults. */
+ 	if (from_xattr != none)
+ 		chosen = from_xattr;
+ 	else if (from_pt != none)
+ 		chosen = from_pt;
+ 	else if (from_ei != none)
+ 		chosen = from_ei;
+ 
+ #if defined(CONFIG_MINISEC_PAGEEXEC) && defined(CONFIG_MINISEC_SEGMEXEC)
+ 	/* Both set: keep PAGEEXEC when _PAGE_NX is supported, otherwise keep SEGMEXEC. */
+ 	if ((chosen & (MF_PAX_PAGEEXEC | MF_PAX_SEGMEXEC)) == (MF_PAX_PAGEEXEC | MF_PAX_SEGMEXEC)) {
+ 		chosen &= (__supported_pte_mask & _PAGE_NX) ? ~MF_PAX_SEGMEXEC : ~MF_PAX_PAGEEXEC;
+ 	}
+ #endif
+ 
+ 	/* pax_check_flags() receives a pointer to the set; a negative result rejects it. */
+ 	if (pax_check_flags(&chosen) < 0)
+ 		return -EINVAL;
+ 
+ 	current->mm->pax_flags = chosen;
+ 	return 0;
+ }
+ #endif
+ 
+ /*
+  * These are the functions used to load ELF style executables and shared
+  * libraries.  There is no binary dependent code anywhere else.
+  * load_elf_binary(): load bprm's ELF image; returns 0 or the error in retval.
+  */
+ static int load_elf_binary(struct linux_binprm *bprm)
+ {
+ 	struct file *interpreter = NULL; /* to shut gcc up */
+ 	unsigned long load_addr = 0, load_bias = 0;
+ 	int load_addr_set = 0;
+ 	unsigned long error;
+ 	struct elf_phdr *elf_ppnt, *elf_phdata, *interp_elf_phdata = NULL;
+ 	unsigned long elf_bss, elf_brk;
+ 	int bss_prot = 0;
+ 	int retval, i;
+ 	unsigned long elf_entry;
+ 	unsigned long interp_load_addr = 0;
+ 	unsigned long start_code, end_code, start_data, end_data;
+ 	unsigned long reloc_func_desc __maybe_unused = 0;
+ 	int executable_stack = EXSTACK_DEFAULT;
+ 	struct {
+ 		struct elfhdr elf_ex;
+ 		struct elfhdr interp_elf_ex;
+ 	} *loc;
+ 	struct arch_elf_state arch_state = INIT_ARCH_ELF_STATE;
+ 	struct pt_regs *regs;
+ 
+ 	loc = kmalloc(sizeof(*loc), GFP_KERNEL);
+ 	if (!loc) {
+ 		retval = -ENOMEM;
+ 		goto out_ret;
+ 	}
+ 
+ 	/* Get the exec-header */
+ 	loc->elf_ex = *((struct elfhdr *)bprm->buf);
+ 
+ 	retval = -ENOEXEC;
+ 	/* First of all, some simple consistency checks */
+ 	if (memcmp(loc->elf_ex.e_ident, ELFMAG, SELFMAG) != 0)
+ 		goto out;
+ 
+ 	if (loc->elf_ex.e_type != ET_EXEC && loc->elf_ex.e_type != ET_DYN)
+ 		goto out;
+ 	if (!elf_check_arch(&loc->elf_ex))
+ 		goto out;
+ 	if (elf_check_fdpic(&loc->elf_ex))
+ 		goto out;
+ 	if (!bprm->file->f_op->mmap)
+ 		goto out;
+ 
+ 	elf_phdata = load_elf_phdrs(&loc->elf_ex, bprm->file);
+ 	if (!elf_phdata)
+ 		goto out;
+ 
+ 	elf_ppnt = elf_phdata;
+ 	for (i = 0; i < loc->elf_ex.e_phnum; i++, elf_ppnt++) {
+ 		char *elf_interpreter;
+ 		loff_t pos;
+ 
+ 		if (elf_ppnt->p_type != PT_INTERP)
+ 			continue;
+ 
+ 		/*
+ 		 * This is the program interpreter used for shared libraries -
+ 		 * for now assume that this is an a.out format binary.
+ 		 */
+ 		retval = -ENOEXEC;
+ 		if (elf_ppnt->p_filesz > PATH_MAX || elf_ppnt->p_filesz < 2)
+ 			goto out_free_ph;
+ 
+ 		retval = -ENOMEM;
+ 		elf_interpreter = kmalloc(elf_ppnt->p_filesz, GFP_KERNEL);
+ 		if (!elf_interpreter)
+ 			goto out_free_ph;
+ 
+ 		pos = elf_ppnt->p_offset;
+ 		retval = kernel_read(bprm->file, elf_interpreter,
+ 				     elf_ppnt->p_filesz, &pos);
+ 		if (retval != elf_ppnt->p_filesz) {
+ 			if (retval >= 0)
+ 				retval = -EIO;
+ 			goto out_free_interp;
+ 		}
+ 		/* make sure path is NUL terminated */
+ 		retval = -ENOEXEC;
+ 		if (elf_interpreter[elf_ppnt->p_filesz - 1] != '\0')
+ 			goto out_free_interp;
+ 
+ 		interpreter = open_exec(elf_interpreter);
+ 		kfree(elf_interpreter);
+ 		retval = PTR_ERR(interpreter);
+ 		if (IS_ERR(interpreter))
+ 			goto out_free_ph;
+ 
+ 		/*
+ 		 * If the binary is not readable then enforce mm->dumpable = 0
+ 		 * regardless of the interpreter's permissions.
+ 		 */
+ 		would_dump(bprm, interpreter);
+ 
+ 		/* Get the exec headers */
+ 		pos = 0;
+ 		retval = kernel_read(interpreter, &loc->interp_elf_ex,
+ 				     sizeof(loc->interp_elf_ex), &pos);
+ 		if (retval != sizeof(loc->interp_elf_ex)) {
+ 			if (retval >= 0)
+ 				retval = -EIO;
+ 			goto out_free_dentry;
+ 		}
+ 
+ 		break;
+ 
+ out_free_interp:
+ 		kfree(elf_interpreter);
+ 		goto out_free_ph;
+ 	}
+ 
+ 	elf_ppnt = elf_phdata;
+ 	for (i = 0; i < loc->elf_ex.e_phnum; i++, elf_ppnt++)
+ 		switch (elf_ppnt->p_type) {
+ 		case PT_GNU_STACK:
+ 			if (elf_ppnt->p_flags & PF_X)
+ 				executable_stack = EXSTACK_ENABLE_X;
+ 			else
+ 				executable_stack = EXSTACK_DISABLE_X;
+ 			break;
+ 
+ 		case PT_LOPROC ... PT_HIPROC:
+ 			retval = arch_elf_pt_proc(&loc->elf_ex, elf_ppnt,
+ 						  bprm->file, false,
+ 						  &arch_state);
+ 			if (retval)
+ 				goto out_free_dentry;
+ 			break;
+ 		}
+ 
+ 	/* Some simple consistency checks for the interpreter */
+ 	if (interpreter) {
+ 		retval = -ELIBBAD;
+ 		/* Not an ELF interpreter */
+ 		if (memcmp(loc->interp_elf_ex.e_ident, ELFMAG, SELFMAG) != 0)
+ 			goto out_free_dentry;
+ 		/* Verify the interpreter has a valid arch */
+ 		if (!elf_check_arch(&loc->interp_elf_ex) ||
+ 		    elf_check_fdpic(&loc->interp_elf_ex))
+ 			goto out_free_dentry;
+ 
+ 		/* Load the interpreter program headers */
+ 		interp_elf_phdata = load_elf_phdrs(&loc->interp_elf_ex,
+ 						   interpreter);
+ 		if (!interp_elf_phdata)
+ 			goto out_free_dentry;
+ 
+ 		/* Pass PT_LOPROC..PT_HIPROC headers to arch code */
+ 		elf_ppnt = interp_elf_phdata;
+ 		for (i = 0; i < loc->interp_elf_ex.e_phnum; i++, elf_ppnt++)
+ 			switch (elf_ppnt->p_type) {
+ 			case PT_LOPROC ... PT_HIPROC:
+ 				retval = arch_elf_pt_proc(&loc->interp_elf_ex,
+ 							  elf_ppnt, interpreter,
+ 							  true, &arch_state);
+ 				if (retval)
+ 					goto out_free_dentry;
+ 				break;
+ 			}
+ 	}
+ 
+ 	/*
+ 	 * Allow arch code to reject the ELF at this point, whilst it's
+ 	 * still possible to return an error to the code that invoked
+ 	 * the exec syscall.
+ 	 */
+ 	retval = arch_check_elf(&loc->elf_ex,
+ 				!!interpreter, &loc->interp_elf_ex,
+ 				&arch_state);
+ 	if (retval)
+ 		goto out_free_dentry;
+ 
+ 	/* Flush all traces of the currently running executable */
+ 	retval = flush_old_exec(bprm);
+ 	if (retval)
+ 		goto out_free_dentry;
+ 
+ 	/* Do this immediately, since STACK_TOP as used in setup_arg_pages
+ 	   may depend on the personality.  */
+ 	SET_PERSONALITY2(loc->elf_ex, &arch_state);
+ 
+ #if defined(CONFIG_MINISEC_NOEXEC) || defined(CONFIG_MINISEC_ASLR)
+ 	current->mm->pax_flags = 0UL;
+ #endif
+ 	/* NOTE(review): pax_flags is still 0 here and X86_32 lacks CONFIG_ - dead code? */
+ #if defined(CONFIG_MINISEC_PAGEEXEC) && defined(X86_32)
+ 	if ((current->mm->pax_flags & MF_PAX_PAGEEXEC) && !(__supported_pte_mask & _PAGE_NX)) {
+ 		current->mm->context.user_cs_limit = PAGE_SIZE;
+ 		current->mm->def_flags |= VM_PAGEEXEC | VM_NOHUGEPAGE;
+ 	}
+ #endif
+ 
+ #ifdef CONFIG_MINISEC_ASLR
+ 	current->mm->delta_mmap = 0UL;
+ 	current->mm->delta_stack = 0UL;
+ #endif
+ 	/* The ELF header lives in loc->elf_ex; its address is what the parser needs. */
+ #if defined(CONFIG_MINISEC_NOEXEC) || defined(CONFIG_MINISEC_ASLR)
+ 	if (0 > pax_parse_pax_flags(&loc->elf_ex, elf_phdata, bprm->file)) {
+ 		retval = -EINVAL;
+ 		goto out_free_dentry;
+ 	}
+ #endif
+ 	/* With RANDMMAP set, draw page-granular random deltas for mmap and stack. */
+ #ifdef CONFIG_MINISEC_ASLR
+ 	if (current->mm->pax_flags & MF_PAX_RANDMMAP) {
+ 		current->mm->delta_mmap = (pax_get_random_long() & ((1UL << PAX_DELTA_MMAP_LEN)-1)) << PAGE_SHIFT;
+ 		current->mm->delta_stack = (pax_get_random_long() & ((1UL << PAX_DELTA_STACK_LEN)-1)) << PAGE_SHIFT;
+ 	}
+ #endif
+ 	/* PAGEEXEC forces EXSTACK_DISABLE_X and clears READ_IMPLIES_EXEC. */
+ #ifdef CONFIG_MINISEC_PAGEEXEC
+ 	if (current->mm->pax_flags & MF_PAX_PAGEEXEC) {
+ 		executable_stack = EXSTACK_DISABLE_X;
+ 		current->personality &= ~READ_IMPLIES_EXEC;
+ 	} else
+ #endif
+ 
+ 	if (elf_read_implies_exec(loc->elf_ex, executable_stack))
+ 		current->personality |= READ_IMPLIES_EXEC;
+ 
+ 	if (!(current->personality & ADDR_NO_RANDOMIZE) && randomize_va_space)
+ 		current->flags |= PF_RANDOMIZE;
+ 
+ 	setup_new_exec(bprm);
+ 	install_exec_creds(bprm);
+ 
+ 	/* Do this so that we can load the interpreter, if need be.  We will
+ 	   change some of these later */
+ 	retval = setup_arg_pages(bprm, randomize_stack_top(STACK_TOP),
+ 				 executable_stack);
+ 	if (retval < 0)
+ 		goto out_free_dentry;
+ 
+ 	elf_bss = 0;
+ 	elf_brk = 0;
+ 
+ 	start_code = ~0UL;
+ 	end_code = 0;
+ 	start_data = 0;
+ 	end_data = 0;
+ 
+ 	/* Now we do a little grungy work by mmapping the ELF image into
+ 	   the correct location in memory. */
+ 	for(i = 0, elf_ppnt = elf_phdata;
+ 	    i < loc->elf_ex.e_phnum; i++, elf_ppnt++) {
+ 		int elf_prot, elf_flags;
+ 		unsigned long k, vaddr;
+ 		unsigned long total_size = 0;
+ 
+ 		if (elf_ppnt->p_type != PT_LOAD)
+ 			continue;
+ 
+ 		if (unlikely (elf_brk > elf_bss)) {
+ 			unsigned long nbyte;
+ 
+ 			/* There was a PT_LOAD segment with p_memsz > p_filesz
+ 			   before this one. Map anonymous pages, if needed,
+ 			   and clear the area.  */
+ 			retval = set_brk(elf_bss + load_bias,
+ 					 elf_brk + load_bias,
+ 					 bss_prot);
+ 			if (retval)
+ 				goto out_free_dentry;
+ 			nbyte = ELF_PAGEOFFSET(elf_bss);
+ 			if (nbyte) {
+ 				nbyte = ELF_MIN_ALIGN - nbyte;
+ 				if (nbyte > elf_brk - elf_bss)
+ 					nbyte = elf_brk - elf_bss;
+ 				if (clear_user((void __user *)elf_bss +
+ 							load_bias, nbyte)) {
+ 					/*
+ 					 * This bss-zeroing can fail if the ELF
+ 					 * file specifies odd protections. So
+ 					 * we don't check the return value
+ 					 */
+ 				}
+ 			}
+ 		}
+ 
+ 		elf_prot = make_prot(elf_ppnt->p_flags);
+ 
+ 		elf_flags = MAP_PRIVATE | MAP_DENYWRITE | MAP_EXECUTABLE;
+ 
+ 		vaddr = elf_ppnt->p_vaddr;
+ 		/*
+ 		 * If we are loading ET_EXEC or we have already performed
+ 		 * the ET_DYN load_addr calculations, proceed normally.
+ 		 */
+ 		if (loc->elf_ex.e_type == ET_EXEC || load_addr_set) {
+ 			elf_flags |= MAP_FIXED;
+ 		} else if (loc->elf_ex.e_type == ET_DYN) {
+ 			/*
+ 			 * This logic is run once for the first LOAD Program
+ 			 * Header for ET_DYN binaries to calculate the
+ 			 * randomization (load_bias) for all the LOAD
+ 			 * Program Headers, and to calculate the entire
+ 			 * size of the ELF mapping (total_size). (Note that
+ 			 * load_addr_set is set to true later once the
+ 			 * initial mapping is performed.)
+ 			 *
+ 			 * There are effectively two types of ET_DYN
+ 			 * binaries: programs (i.e. PIE: ET_DYN with INTERP)
+ 			 * and loaders (ET_DYN without INTERP, since they
+ 			 * _are_ the ELF interpreter). The loaders must
+ 			 * be loaded away from programs since the program
+ 			 * may otherwise collide with the loader (especially
+ 			 * for ET_EXEC which does not have a randomized
+ 			 * position). For example to handle invocations of
+ 			 * "./ld.so someprog" to test out a new version of
+ 			 * the loader, the subsequent program that the
+ 			 * loader loads must avoid the loader itself, so
+ 			 * they cannot share the same load range. Sufficient
+ 			 * room for the brk must be allocated with the
+ 			 * loader as well, since brk must be available with
+ 			 * the loader.
+ 			 *
+ 			 * Therefore, programs are loaded offset from
+ 			 * ELF_ET_DYN_BASE and loaders are loaded into the
+ 			 * independently randomized mmap region (0 load_bias
+ 			 * without MAP_FIXED).
+ 			 */
+ 			if (interpreter) {
+ 				load_bias = ELF_ET_DYN_BASE;
+ 				if (current->flags & PF_RANDOMIZE)
+ 					load_bias += arch_mmap_rnd();
+ 				elf_flags |= MAP_FIXED;
+ 			} else
+ 				load_bias = 0;
+ 
+ 			/*
+ 			 * Since load_bias is used for all subsequent loading
+ 			 * calculations, we must lower it by the first vaddr
+ 			 * so that the remaining calculations based on the
+ 			 * ELF vaddrs will be correctly offset. The result
+ 			 * is then page aligned.
+ 			 */
+ 			load_bias = ELF_PAGESTART(load_bias - vaddr);
+ 
+ 			total_size = total_mapping_size(elf_phdata,
+ 							loc->elf_ex.e_phnum);
+ 			if (!total_size) {
+ 				retval = -EINVAL;
+ 				goto out_free_dentry;
+ 			}
+ 		}
+ 
+ 		error = elf_map(bprm->file, load_bias + vaddr, elf_ppnt,
+ 				elf_prot, elf_flags, total_size);
+ 		if (BAD_ADDR(error)) {
+ 			retval = IS_ERR((void *)error) ?
+ 				PTR_ERR((void*)error) : -EINVAL;
+ 			goto out_free_dentry;
+ 		}
+ 
+ 		if (!load_addr_set) {
+ 			load_addr_set = 1;
+ 			load_addr = (elf_ppnt->p_vaddr - elf_ppnt->p_offset);
+ 			if (loc->elf_ex.e_type == ET_DYN) {
+ 				load_bias += error -
+ 				             ELF_PAGESTART(load_bias + vaddr);
+ 				load_addr += load_bias;
+ 				reloc_func_desc = load_bias;
+ 			}
+ 		}
+ 		k = elf_ppnt->p_vaddr;
+ 		if (k < start_code)
+ 			start_code = k;
+ 		if (start_data < k)
+ 			start_data = k;
+ 
+ 		/*
+ 		 * Check to see if the section's size will overflow the
+ 		 * allowed task size. Note that p_filesz must always be
+ 		 * <= p_memsz so it is only necessary to check p_memsz.
+ 		 */
+ 		if (BAD_ADDR(k) || elf_ppnt->p_filesz > elf_ppnt->p_memsz ||
+ 		    elf_ppnt->p_memsz > TASK_SIZE ||
+ 		    TASK_SIZE - elf_ppnt->p_memsz < k) {
+ 			/* set_brk can never work. Avoid overflows. */
+ 			retval = -EINVAL;
+ 			goto out_free_dentry;
+ 		}
+ 
+ 		k = elf_ppnt->p_vaddr + elf_ppnt->p_filesz;
+ 
+ 		if (k > elf_bss)
+ 			elf_bss = k;
+ 		if ((elf_ppnt->p_flags & PF_X) && end_code < k)
+ 			end_code = k;
+ 		if (end_data < k)
+ 			end_data = k;
+ 		k = elf_ppnt->p_vaddr + elf_ppnt->p_memsz;
+ 		if (k > elf_brk) {
+ 			bss_prot = elf_prot;
+ 			elf_brk = k;
+ 		}
+ 	}
+ 
+ 	loc->elf_ex.e_entry += load_bias;
+ 	elf_bss += load_bias;
+ 	elf_brk += load_bias;
+ 	start_code += load_bias;
+ 	end_code += load_bias;
+ 	start_data += load_bias;
+ 	end_data += load_bias;
+ 
+ 	/* Calling set_brk effectively mmaps the pages that we need
+ 	 * for the bss and break sections.  We must do this before
+ 	 * mapping in the interpreter, to make sure it doesn't wind
+ 	 * up getting placed where the bss needs to go.
+ 	 */
+ 	retval = set_brk(elf_bss, elf_brk, bss_prot);
+ 	if (retval)
+ 		goto out_free_dentry;
+ 	if (likely(elf_bss != elf_brk) && unlikely(padzero(elf_bss))) {
+ 		retval = -EFAULT; /* Nobody gets to see this, but.. */
+ 		goto out_free_dentry;
+ 	}
+ 
+ 	if (interpreter) {
+ 		unsigned long interp_map_addr = 0;
+ 
+ 		elf_entry = load_elf_interp(&loc->interp_elf_ex,
+ 					    interpreter,
+ 					    &interp_map_addr,
+ 					    load_bias, interp_elf_phdata);
+ 		if (!IS_ERR((void *)elf_entry)) {
+ 			/*
+ 			 * load_elf_interp() returns relocation
+ 			 * adjustment
+ 			 */
+ 			interp_load_addr = elf_entry;
+ 			elf_entry += loc->interp_elf_ex.e_entry;
+ 		}
+ 		if (BAD_ADDR(elf_entry)) {
+ 			retval = IS_ERR((void *)elf_entry) ?
+ 					(int)elf_entry : -EINVAL;
+ 			goto out_free_dentry;
+ 		}
+ 		reloc_func_desc = interp_load_addr;
+ 
+ 		allow_write_access(interpreter);
+ 		fput(interpreter);
+ 	} else {
+ 		elf_entry = loc->elf_ex.e_entry;
+ 		if (BAD_ADDR(elf_entry)) {
+ 			retval = -EINVAL;
+ 			goto out_free_dentry;
+ 		}
+ 	}
+ 
+ 	kfree(interp_elf_phdata);
+ 	kfree(elf_phdata);
+ 
+ 	set_binfmt(&elf_format);
+ 
+ #ifdef ARCH_HAS_SETUP_ADDITIONAL_PAGES
+ 	retval = arch_setup_additional_pages(bprm, !!interpreter);
+ 	if (retval < 0)
+ 		goto out;
+ #endif /* ARCH_HAS_SETUP_ADDITIONAL_PAGES */
+ 
+ 	retval = create_elf_tables(bprm, &loc->elf_ex,
+ 			  load_addr, interp_load_addr);
+ 	if (retval < 0)
+ 		goto out;
+ 	current->mm->end_code = end_code;
+ 	current->mm->start_code = start_code;
+ 	current->mm->start_data = start_data;
+ 	current->mm->end_data = end_data;
+ 	current->mm->start_stack = bprm->p;
+ 
+ 	if ((current->flags & PF_RANDOMIZE) && (randomize_va_space > 1)) {
+ 		/*
+ 		 * For architectures with ELF randomization, when executing
+ 		 * a loader directly (i.e. no interpreter listed in ELF
+ 		 * headers), move the brk area out of the mmap region
+ 		 * (since it grows up, and may collide early with the stack
+ 		 * growing down), and into the unused ELF_ET_DYN_BASE region.
+ 		 */
+ 		if (IS_ENABLED(CONFIG_ARCH_HAS_ELF_RANDOMIZE) &&
+ 		    loc->elf_ex.e_type == ET_DYN && !interpreter)
+ 			current->mm->brk = current->mm->start_brk =
+ 				ELF_ET_DYN_BASE;
+ 
+ 		current->mm->brk = current->mm->start_brk =
+ 			arch_randomize_brk(current->mm);
+ #ifdef compat_brk_randomized
+ 		current->brk_randomized = 1;
+ #endif
+ 	}
+ 
+ 	if (current->personality & MMAP_PAGE_ZERO) {
+ 		/* Why this, you ask???  Well SVr4 maps page 0 as read-only,
+ 		   and some applications "depend" upon this behavior.
+ 		   Since we do not have the power to recompile these, we
+ 		   emulate the SVr4 behavior. Sigh. */
+ 		error = vm_mmap(NULL, 0, PAGE_SIZE, PROT_READ | PROT_EXEC,
+ 				MAP_FIXED | MAP_PRIVATE, 0);
+ 	}
+ 
+ 	regs = current_pt_regs();
+ #ifdef ELF_PLAT_INIT
+ 	/*
+ 	 * The ABI may specify that certain registers be set up in special
+ 	 * ways (on i386 %edx is the address of a DT_FINI function, for
+ 	 * example).  In addition, it may also specify (eg, PowerPC64 ELF)
+ 	 * that the e_entry field is the address of the function descriptor
+ 	 * for the startup routine, rather than the address of the startup
+ 	 * routine itself.  This macro performs whatever initialization to
+ 	 * the regs structure is required as well as any relocations to the
+ 	 * function descriptor entries when executing dynamically linked apps.
+ 	 */
+ 	ELF_PLAT_INIT(regs, reloc_func_desc);
+ #endif
+ 
+ 	finalize_exec(bprm);
+ 	start_thread(regs, elf_entry, bprm->p);
+ 	retval = 0;
+ out:
+ 	kfree(loc);
+ out_ret:
+ 	return retval;
+ 
+ 	/* error cleanup */
+ out_free_dentry:
+ 	kfree(interp_elf_phdata);
+ 	allow_write_access(interpreter);
+ 	if (interpreter)
+ 		fput(interpreter);
+ out_free_ph:
+ 	kfree(elf_phdata);
+ 	goto out;
+ }
+ 
+ #ifdef CONFIG_USELIB
+ /* This is really simpleminded and specialized - we are loading an a.out library
+    that is given an ELF header.  Returns 0 on success, non-zero on failure. */
+ static int load_elf_library(struct file *file)
+ {
+ 	struct elf_phdr *elf_phdata;
+ 	struct elf_phdr *eppnt;
+ 	unsigned long elf_bss, bss, len;
+ 	int retval, error, i, j;
+ 	struct elfhdr elf_ex;
+ 	loff_t pos = 0;
+ 
+ 	error = -ENOEXEC;
+ 	retval = kernel_read(file, &elf_ex, sizeof(elf_ex), &pos);
+ 	if (retval != sizeof(elf_ex))
+ 		goto out;
+ 
+ 	if (memcmp(elf_ex.e_ident, ELFMAG, SELFMAG) != 0)
+ 		goto out;
+ 
+ 	/* First of all, some simple consistency checks */
+ 	if (elf_ex.e_type != ET_EXEC || elf_ex.e_phnum > 2 ||
+ 	    !elf_check_arch(&elf_ex) || !file->f_op->mmap)
+ 		goto out;
+ 	if (elf_check_fdpic(&elf_ex))
+ 		goto out;
+ 
+ 	/* Now read in all of the header information */
+ 
+ 	j = sizeof(struct elf_phdr) * elf_ex.e_phnum;
+ 	/* j < ELF_MIN_ALIGN because elf_ex.e_phnum <= 2 */
+ 
+ 	error = -ENOMEM;
+ 	elf_phdata = kmalloc(j, GFP_KERNEL);
+ 	if (!elf_phdata)
+ 		goto out;
+ 
+ 	eppnt = elf_phdata;
+ 	error = -ENOEXEC;
+ 	pos =  elf_ex.e_phoff;
+ 	retval = kernel_read(file, eppnt, j, &pos);
+ 	if (retval != j)
+ 		goto out_free_ph;
+ 	/* Exactly one PT_LOAD header is accepted; j is reused as its counter. */
+ 	for (j = 0, i = 0; i<elf_ex.e_phnum; i++)
+ 		if ((eppnt + i)->p_type == PT_LOAD)
+ 			j++;
+ 	if (j != 1)
+ 		goto out_free_ph;
+ 	/* Advance eppnt to that single PT_LOAD header. */
+ 	while (eppnt->p_type != PT_LOAD)
+ 		eppnt++;
+ 
+ 	/* Now use mmap to map the library into memory. */
+ 	error = vm_mmap(file,
+ 			ELF_PAGESTART(eppnt->p_vaddr),
+ 			(eppnt->p_filesz +
+ 			 ELF_PAGEOFFSET(eppnt->p_vaddr)),
+ 			PROT_READ | PROT_WRITE | PROT_EXEC,
+ 			MAP_FIXED_NOREPLACE | MAP_PRIVATE | MAP_DENYWRITE,
+ 			(eppnt->p_offset -
+ 			 ELF_PAGEOFFSET(eppnt->p_vaddr)));
+ 	if (error != ELF_PAGESTART(eppnt->p_vaddr))	/* anything but the requested address */
+ 		goto out_free_ph;
+ 	/* elf_bss is the first address past the file-backed data; pad from there. */
+ 	elf_bss = eppnt->p_vaddr + eppnt->p_filesz;
+ 	if (padzero(elf_bss)) {
+ 		error = -EFAULT;
+ 		goto out_free_ph;
+ 	}
+ 	/* Whatever lies between the page-aligned file end and memory end is bss. */
+ 	len = ELF_PAGEALIGN(eppnt->p_filesz + eppnt->p_vaddr);
+ 	bss = ELF_PAGEALIGN(eppnt->p_memsz + eppnt->p_vaddr);
+ 	if (bss > len) {
+ 		error = vm_brk(len, bss - len);
+ 		if (error)
+ 			goto out_free_ph;
+ 	}
+ 	error = 0;
+ 
+ out_free_ph:
+ 	kfree(elf_phdata);
+ out:
+ 	return error;
+ }
+ #endif /* #ifdef CONFIG_USELIB */
+ 
+ #ifdef CONFIG_ELF_CORE
+ /*
+  * ELF core dumper
+  *
+  * Modelled on fs/exec.c:aout_core_dump()
+  * Jeremy Fitzhardinge <jeremy@sw.oz.au>
+  */
+ 
+ /*
+  * The purpose of always_dump_vma() is to make sure that special kernel mappings
+  * that are useful for post-mortem analysis are included in every core dump.
+  * In that way we ensure that the core dump is fully interpretable later
+  * without matching up the same kernel and hardware config to see what PC values
+  * meant. These special mappings include - vDSO, vsyscall, and other
+  * architecture specific mappings
+  */
+ static bool always_dump_vma(struct vm_area_struct *vma)
+ {
+ 	/*
+ 	 * A vma is dumped unconditionally as soon as one of these holds
+ 	 * (they are tested in this order):
+ 	 *  - it is the gate vma of its mm, i.e. a vsyscall mapping;
+ 	 *  - its vm_ops has a ->name() op that returns non-NULL; all
+ 	 *    such vmas are assumed to be worth dumping, and a new
+ 	 *    vm_ops field can easily be added if that ever changes;
+ 	 *  - arch_vma_name() returns non-NULL, which it does for
+ 	 *    special architecture mappings such as vDSO sections.
+ 	 * Anything else is left to the regular dump filters.
+ 	 */
+ 	if (vma == get_gate_vma(vma->vm_mm))
+ 		return true;
+ 
+ 	if (vma->vm_ops && vma->vm_ops->name && vma->vm_ops->name(vma))
+ 		return true;
+ 
+ 	/* Last criterion: a non-NULL architecture name. */
+ 	return arch_vma_name(vma) != NULL;
+ }
+ 
+ /*
+  * Decide what to dump of a segment (all, first page or none); returns bytes.
+  */
+ static unsigned long vma_dump_size(struct vm_area_struct *vma,
+ 				   unsigned long mm_flags)
+ {
+ #define FILTER(type)	(mm_flags & (1UL << MMF_DUMP_##type))
+ 	/* FILTER(x) is non-zero when bit MMF_DUMP_x is set in mm_flags. */
+ 	/* always dump the vdso and vsyscall sections */
+ 	if (always_dump_vma(vma))
+ 		goto whole;
+ 
+ 	if (vma->vm_flags & VM_DONTDUMP)
+ 		return 0;
+ 
+ 	/* support for DAX */
+ 	if (vma_is_dax(vma)) {
+ 		if ((vma->vm_flags & VM_SHARED) && FILTER(DAX_SHARED))
+ 			goto whole;
+ 		if (!(vma->vm_flags & VM_SHARED) && FILTER(DAX_PRIVATE))
+ 			goto whole;
+ 		return 0;
+ 	}
+ 
+ 	/* Hugetlb memory check */
+ 	if (vma->vm_flags & VM_HUGETLB) {
+ 		if ((vma->vm_flags & VM_SHARED) && FILTER(HUGETLB_SHARED))
+ 			goto whole;
+ 		if (!(vma->vm_flags & VM_SHARED) && FILTER(HUGETLB_PRIVATE))
+ 			goto whole;
+ 		return 0;
+ 	}
+ 
+ 	/* Do not dump I/O mapped devices or special mappings */
+ 	if (vma->vm_flags & VM_IO)
+ 		return 0;
+ 
+ 	/* By default, dump shared memory if mapped from an anonymous file. */
+ 	if (vma->vm_flags & VM_SHARED) {
+ 		if (file_inode(vma->vm_file)->i_nlink == 0 ?
+ 		    FILTER(ANON_SHARED) : FILTER(MAPPED_SHARED))
+ 			goto whole;
+ 		return 0;
+ 	}
+ 
+ 	/* Dump segments that have been written to.  */
+ 	if (vma->anon_vma && FILTER(ANON_PRIVATE))
+ 		goto whole;
+ 	if (vma->vm_file == NULL)
+ 		return 0;
+ 	/* What remains is a private, file-backed mapping. */
+ 	if (FILTER(MAPPED_PRIVATE))
+ 		goto whole;
+ 
+ 	/*
+ 	 * If this looks like the beginning of a DSO or executable mapping,
+ 	 * check for an ELF header.  If we find one, dump the first page to
+ 	 * aid in determining what was mapped here.
+ 	 */
+ 	if (FILTER(ELF_HEADERS) &&
+ 	    vma->vm_pgoff == 0 && (vma->vm_flags & VM_READ)) {
+ 		u32 __user *header = (u32 __user *) vma->vm_start;
+ 		u32 word;
+ 		mm_segment_t fs = get_fs();
+ 		/*
+ 		 * Doing it this way gets the constant folded by GCC.
+ 		 */
+ 		union {
+ 			u32 cmp;
+ 			char elfmag[SELFMAG];
+ 		} magic;
+ 		BUILD_BUG_ON(SELFMAG != sizeof word);
+ 		magic.elfmag[EI_MAG0] = ELFMAG0;
+ 		magic.elfmag[EI_MAG1] = ELFMAG1;
+ 		magic.elfmag[EI_MAG2] = ELFMAG2;
+ 		magic.elfmag[EI_MAG3] = ELFMAG3;
+ 		/*
+ 		 * Switch to the user "segment" for get_user(),
+ 		 * then put back what elf_core_dump() had in place.
+ 		 */
+ 		set_fs(USER_DS);
+ 		if (unlikely(get_user(word, header)))
+ 			word = 0;	/* unreadable: 0 is meant not to match the magic */
+ 		set_fs(fs);
+ 		if (word == magic.cmp)
+ 			return PAGE_SIZE;
+ 	}
+ 
+ #undef	FILTER
+ 
+ 	return 0;
+ 
+ whole:
+ 	return vma->vm_end - vma->vm_start;
+ }
+ 
+ /* An ELF note in memory */
+ struct memelfnote
+ {
+ 	const char *name;	/* NUL-terminated; strlen() + 1 becomes n_namesz */
+ 	int type;		/* written out as n_type */
+ 	unsigned int datasz;	/* byte count of *data; written out as n_descsz */
+ 	void *data;		/* payload emitted after the name */
+ };
+ 
+ static int notesize(struct memelfnote *en)
+ {
+ 	/*
+ 	 * Size of one note: the fixed struct elf_note header, then the
+ 	 * name (with its NUL) and the data, each rounded up to 4 bytes.
+ 	 */
+ 	return sizeof(struct elf_note) +
+ 	       roundup(strlen(en->name) + 1, 4) +
+ 	       roundup(en->datasz, 4);
+ }
+ 
+ static int writenote(struct memelfnote *men, struct coredump_params *cprm)
+ {
+ 	/* Emit header, name and data in that order; dump_align(.., 4) follows name and data. */
+ 	struct elf_note hdr = { .n_namesz = strlen(men->name) + 1,
+ 				.n_descsz = men->datasz, .n_type = men->type };
+ 
+ 	if (!dump_emit(cprm, &hdr, sizeof(hdr)) ||
+ 	    !dump_emit(cprm, men->name, hdr.n_namesz) || !dump_align(cprm, 4))
+ 		return 0;
+ 	return dump_emit(cprm, men->data, men->datasz) && dump_align(cprm, 4);
+ }
+ 
+ static void fill_elf_header(struct elfhdr *elf, int segs,
+ 			    u16 machine, u32 flags)
+ {
+ 	/* Fields that are not assigned below stay zero. */
+ 	memset(elf, 0, sizeof(*elf));
+ 	memcpy(elf->e_ident, ELFMAG, SELFMAG);
+ 	elf->e_ident[EI_OSABI] = ELF_OSABI;
+ 	elf->e_ident[EI_VERSION] = EV_CURRENT;
+ 	elf->e_ident[EI_DATA] = ELF_DATA;
+ 	elf->e_ident[EI_CLASS] = ELF_CLASS;
+ 	/* An ET_CORE file; e_phoff places the program headers right after this header. */
+ 	elf->e_type = ET_CORE;
+ 	elf->e_version = EV_CURRENT;
+ 	elf->e_machine = machine;
+ 	elf->e_flags = flags;
+ 	elf->e_phnum = segs;
+ 	elf->e_phentsize = sizeof(struct elf_phdr);
+ 	elf->e_phoff = sizeof(*elf);
+ 	elf->e_ehsize = sizeof(*elf);
+ }
+ 
+ static void fill_elf_note_phdr(struct elf_phdr *phdr, int sz, loff_t offset)
+ {
+ 	phdr->p_type = PT_NOTE;
+ 	phdr->p_flags = 0;
+ 	phdr->p_align = 0;
+ 	phdr->p_vaddr = 0;	/* the note segment gets no address or memory size */
+ 	phdr->p_paddr = 0;
+ 	phdr->p_memsz = 0;
+ 	phdr->p_offset = offset;	/* file position of the note data */
+ 	phdr->p_filesz = sz;		/* total bytes of note data */
+ }
+ 
+ static void fill_note(struct memelfnote *note, const char *name, int type,
+ 		unsigned int sz, void *data)
+ {
+ 	/* Record the note by reference: neither name nor data is copied. */
+ 	*note = (struct memelfnote) {
+ 		.name = name, .type = type, .datasz = sz, .data = data,
+ 	};
+ }
+ 
+ /*
+  * Fill in the signal, pid and CPU-time fields of prstatus from task p.
+  * Registers (pr_reg) are not touched here; callers fill them separately.
+  */
+ static void fill_prstatus(struct elf_prstatus *prstatus,
+ 		struct task_struct *p, long signr)
+ {
+ 	prstatus->pr_info.si_signo = prstatus->pr_cursig = signr;
+ 	prstatus->pr_sigpend = p->pending.signal.sig[0];	/* first word only */
+ 	prstatus->pr_sighold = p->blocked.sig[0];		/* first word only */
+ 	rcu_read_lock();	/* real_parent is dereferenced under RCU */
+ 	prstatus->pr_ppid = task_pid_vnr(rcu_dereference(p->real_parent));
+ 	rcu_read_unlock();
+ 	prstatus->pr_pid = task_pid_vnr(p);
+ 	prstatus->pr_pgrp = task_pgrp_vnr(p);
+ 	prstatus->pr_sid = task_session_vnr(p);
+ 	if (thread_group_leader(p)) {
+ 		struct task_cputime cputime;
+ 
+ 		/*
+ 		 * This is the record for the group leader.  It shows the
+ 		 * group-wide total, not its individual thread total.
+ 		 */
+ 		thread_group_cputime(p, &cputime);
+ 		prstatus->pr_utime = ns_to_timeval(cputime.utime);
+ 		prstatus->pr_stime = ns_to_timeval(cputime.stime);
+ 	} else {
+ 		u64 utime, stime;
+ 
+ 		task_cputime(p, &utime, &stime);	/* this thread's own times */
+ 		prstatus->pr_utime = ns_to_timeval(utime);
+ 		prstatus->pr_stime = ns_to_timeval(stime);
+ 	}
+ 
+ 	prstatus->pr_cutime = ns_to_timeval(p->signal->cutime);
+ 	prstatus->pr_cstime = ns_to_timeval(p->signal->cstime);
+ }
+ 
+ static int fill_psinfo(struct elf_prpsinfo *psinfo, struct task_struct *p,
+ 		       struct mm_struct *mm)
+ {
+ 	const struct cred *cred;
+ 	unsigned int i, len;
+ 	
+ 	/* Start from a zeroed record, then copy the arguments from user space. */
+ 	memset(psinfo, 0, sizeof(struct elf_prpsinfo));
+ 
+ 	len = mm->arg_end - mm->arg_start;
+ 	if (len >= ELF_PRARGSZ)
+ 		len = ELF_PRARGSZ-1;	/* leave room for the terminating NUL */
+ 	if (copy_from_user(&psinfo->pr_psargs,
+ 		           (const char __user *)mm->arg_start, len))
+ 		return -EFAULT;
+ 	for(i = 0; i < len; i++)
+ 		if (psinfo->pr_psargs[i] == 0)
+ 			psinfo->pr_psargs[i] = ' ';	/* NUL bytes become spaces */
+ 	psinfo->pr_psargs[len] = 0;
+ 
+ 	rcu_read_lock();
+ 	psinfo->pr_ppid = task_pid_vnr(rcu_dereference(p->real_parent));
+ 	rcu_read_unlock();
+ 	psinfo->pr_pid = task_pid_vnr(p);
+ 	psinfo->pr_pgrp = task_pgrp_vnr(p);
+ 	psinfo->pr_sid = task_session_vnr(p);
+ 
+ 	i = p->state ? ffz(~p->state) + 1 : 0;	/* 0 when no state bit is set */
+ 	psinfo->pr_state = i;
+ 	psinfo->pr_sname = (i > 5) ? '.' : "RSDTZW"[i];	/* '.' when out of range */
+ 	psinfo->pr_zomb = psinfo->pr_sname == 'Z';
+ 	psinfo->pr_nice = task_nice(p);
+ 	psinfo->pr_flag = p->flags;
+ 	rcu_read_lock();	/* the credentials are read under RCU */
+ 	cred = __task_cred(p);
+ 	SET_UID(psinfo->pr_uid, from_kuid_munged(cred->user_ns, cred->uid));
+ 	SET_GID(psinfo->pr_gid, from_kgid_munged(cred->user_ns, cred->gid));
+ 	rcu_read_unlock();
+ 	strncpy(psinfo->pr_fname, p->comm, sizeof(psinfo->pr_fname));
+ 	
+ 	return 0;	/* -EFAULT above is the only failure */
+ }
+ 
+ static void fill_auxv_note(struct memelfnote *note, struct mm_struct *mm)
+ {
+ 	elf_addr_t *auxv = (elf_addr_t *) mm->saved_auxv;
+ 	int words = 2;	/* entries are walked two words at a time */
+ 	/* Count every pair up to and including the AT_NULL terminator. */
+ 	while (auxv[words - 2] != AT_NULL)
+ 		words += 2;
+ 	fill_note(note, "CORE", NT_AUXV, words * sizeof(elf_addr_t), auxv);
+ }
+ 
+ static void fill_siginfo_note(struct memelfnote *note, user_siginfo_t *csigdata,
+ 		const kernel_siginfo_t *siginfo)
+ {
+ 	mm_segment_t old_fs = get_fs();
+ 	set_fs(KERNEL_DS);	/* csigdata is a kernel buffer passed as __user */
+ 	copy_siginfo_to_user((user_siginfo_t __user *) csigdata, siginfo);
+ 	set_fs(old_fs);		/* put back the segment that was in force */
+ 	fill_note(note, "CORE", NT_SIGINFO, sizeof(*csigdata), csigdata);
+ }
+ 
+ #define MAX_FILE_NOTE_SIZE (4*1024*1024)
+ /*
+  * Format of NT_FILE note:
+  *
+  * long count     -- how many files are mapped
+  * long page_size -- units for file_ofs
+  * array of [COUNT] elements of
+  *   long start
+  *   long end
+  *   long file_ofs
+  * followed by COUNT filenames in ASCII: "FILE1" NUL "FILE2" NUL...
+  */
+ static int fill_files_note(struct memelfnote *note)
+ {
+ 	struct vm_area_struct *vma;
+ 	unsigned count, size, names_ofs, remaining, n;
+ 	user_long_t *data;
+ 	user_long_t *start_end_ofs;
+ 	char *name_base, *name_curpos;
+ 
+ 	/* *Estimated* file count and total data size needed */
+ 	count = current->mm->map_count;
+ 	if (count > UINT_MAX / 64)
+ 		return -EINVAL;
+ 	size = count * 64;	/* first guess: 64 bytes per mapping */
+ 
+ 	names_ofs = (2 + 3 * count) * sizeof(data[0]);	/* names follow the longs */
+  alloc:
+ 	if (size >= MAX_FILE_NOTE_SIZE) /* paranoia check */
+ 		return -EINVAL;
+ 	size = round_up(size, PAGE_SIZE);
+ 	data = kvmalloc(size, GFP_KERNEL);
+ 	if (ZERO_OR_NULL_PTR(data))
+ 		return -ENOMEM;
+ 
+ 	start_end_ofs = data + 2;	/* skip the count and page_size slots */
+ 	name_base = name_curpos = ((char *)data) + names_ofs;
+ 	remaining = size - names_ofs;
+ 	count = 0;
+ 	for (vma = current->mm->mmap; vma != NULL; vma = vma->vm_next) {
+ 		struct file *file;
+ 		const char *filename;
+ 
+ 		file = vma->vm_file;
+ 		if (!file)
+ 			continue;	/* only file-backed mappings are listed */
+ 		filename = file_path(file, name_curpos, remaining);
+ 		if (IS_ERR(filename)) {
+ 			if (PTR_ERR(filename) == -ENAMETOOLONG) {
+ 				kvfree(data);
+ 				size = size * 5 / 4;	/* grow by a quarter and retry */
+ 				goto alloc;
+ 			}
+ 			continue;
+ 		}
+ 
+ 		/* file_path() fills at the end, move name down */
+ 		/* n = strlen(filename) + 1: */
+ 		n = (name_curpos + remaining) - filename;
+ 		remaining = filename - name_curpos;
+ 		memmove(name_curpos, filename, n);
+ 		name_curpos += n;
+ 
+ 		*start_end_ofs++ = vma->vm_start;
+ 		*start_end_ofs++ = vma->vm_end;
+ 		*start_end_ofs++ = vma->vm_pgoff;
+ 		count++;
+ 	}
+ 
+ 	/* Now we know exact count of files, can store it */
+ 	data[0] = count;
+ 	data[1] = PAGE_SIZE;
+ 	/*
+ 	 * Count usually is less than current->mm->map_count,
+ 	 * we need to move filenames down.
+ 	 */
+ 	n = current->mm->map_count - count;
+ 	if (n != 0) {
+ 		unsigned shift_bytes = n * 3 * sizeof(data[0]);
+ 		memmove(name_base - shift_bytes, name_base,
+ 			name_curpos - name_base);
+ 		name_curpos -= shift_bytes;
+ 	}
+ 
+ 	size = name_curpos - (char *)data;	/* bytes actually used */
+ 	fill_note(note, "CORE", NT_FILE, size, data);	/* note now refers to data */
+ 	return 0;
+ }
+ 
+ #ifdef CORE_DUMP_USE_REGSET
+ #include <linux/regset.h>
+ 
+ struct elf_thread_core_info {
+ 	struct elf_thread_core_info *next;	/* next thread in the list */
+ 	struct task_struct *task;		/* thread this entry describes */
+ 	struct elf_prstatus prstatus;		/* payload behind notes[0] */
+ 	struct memelfnote notes[0];		/* sized when the entry is allocated */
+ };
+ 
+ struct elf_note_info {
+ 	struct elf_thread_core_info *thread;	/* list head; dumping task first */
+ 	struct memelfnote psinfo;	/* NT_PRPSINFO */
+ 	struct memelfnote signote;	/* NT_SIGINFO */
+ 	struct memelfnote auxv;		/* NT_AUXV */
+ 	struct memelfnote files;	/* NT_FILE; written only if data is set */
+ 	user_siginfo_t csigdata;	/* payload behind signote */
+ 	size_t size;			/* running sum of notesize() results */
+ 	int thread_notes;		/* entries in each thread's notes[] */
+ };
+ 
+ /*
+  * Run a regset's optional writeback hook so that, on register window
+  * machines, user memory backing the registers is current before we dump it.
+  */
+ static void do_thread_regset_writeback(struct task_struct *task,
+ 				       const struct user_regset *regset)
+ {
+ 	if (!regset->writeback)
+ 		return;
+ 	regset->writeback(task, regset, 1);
+ }
+ 
+ #ifndef PRSTATUS_SIZE	/* fallback, used unless already defined */
+ #define PRSTATUS_SIZE(S, R) sizeof(S)
+ #endif
+ 
+ #ifndef SET_PR_FPVALID	/* fallback, used unless already defined */
+ #define SET_PR_FPVALID(S, V, R) ((S)->pr_fpvalid = (V))
+ #endif
+ 
+ static int fill_thread_core_info(struct elf_thread_core_info *t,
+ 				 const struct user_regset_view *view,
+ 				 long signr, size_t *total)
+ {
+ 	unsigned int i;
+ 	unsigned int regset0_size = regset_size(t->task, &view->regsets[0]);
+ 
+ 	/*
+ 	 * NT_PRSTATUS is the one special case, because the regset data
+ 	 * goes into the pr_reg field inside the note contents, rather
+ 	 * than being the whole note contents.  We fill the rest in here.
+ 	 * We assume that regset 0 is NT_PRSTATUS.
+ 	 */
+ 	fill_prstatus(&t->prstatus, t->task, signr);
+ 	(void) view->regsets[0].get(t->task, &view->regsets[0], 0, regset0_size,
+ 				    &t->prstatus.pr_reg, NULL);
+ 
+ 	fill_note(&t->notes[0], "CORE", NT_PRSTATUS,
+ 		  PRSTATUS_SIZE(t->prstatus, regset0_size), &t->prstatus);
+ 	*total += notesize(&t->notes[0]);
+ 
+ 	do_thread_regset_writeback(t->task, &view->regsets[0]);
+ 
+ 	/*
+ 	 * Each other regset might generate a note too.  For each regset
+ 	 * that has no core_note_type or is inactive, we leave t->notes[i]
+ 	 * all zero and we'll know to skip writing it later.
+ 	 */
+ 	for (i = 1; i < view->n; ++i) {
+ 		const struct user_regset *regset = &view->regsets[i];
+ 		do_thread_regset_writeback(t->task, regset);
+ 		if (regset->core_note_type && regset->get &&
+ 		    (!regset->active || regset->active(t->task, regset) > 0)) {
+ 			int ret;
+ 			size_t size = regset_size(t->task, regset);
+ 			void *data = kzalloc(size, GFP_KERNEL);
+ 			if (unlikely(!data))
+ 				return 0;	/* out of memory */
+ 			ret = regset->get(t->task, regset,
+ 					  0, size, data, NULL);
+ 			if (unlikely(ret))
+ 				kfree(data);	/* no note for this regset */
+ 			else {
+ 				if (regset->core_note_type != NT_PRFPREG)
+ 					fill_note(&t->notes[i], "LINUX",
+ 						  regset->core_note_type,
+ 						  size, data);
+ 				else {
+ 					SET_PR_FPVALID(&t->prstatus,
+ 							1, regset0_size);
+ 					fill_note(&t->notes[i], "CORE",
+ 						  NT_PRFPREG, size, data);
+ 				}
+ 				*total += notesize(&t->notes[i]);
+ 			}
+ 		}
+ 	}
+ 
+ 	return 1;	/* every note that could be built was added to *total */
+ }
+ 
+ static int fill_note_info(struct elfhdr *elf, int phdrs,
+ 			  struct elf_note_info *info,
+ 			  const kernel_siginfo_t *siginfo, struct pt_regs *regs)
+ {
+ 	struct task_struct *dump_task = current;
+ 	const struct user_regset_view *view = task_user_regset_view(dump_task);
+ 	struct elf_thread_core_info *t;
+ 	struct elf_prpsinfo *psinfo;
+ 	struct core_thread *ct;
+ 	unsigned int i;
+ 
+ 	info->size = 0;
+ 	info->thread = NULL;
+ 
+ 	psinfo = kmalloc(sizeof(*psinfo), GFP_KERNEL);
+ 	if (psinfo == NULL) {
+ 		info->psinfo.data = NULL; /* So we don't free this wrongly */
+ 		return 0;
+ 	}
+ 
+ 	fill_note(&info->psinfo, "CORE", NT_PRPSINFO, sizeof(*psinfo), psinfo);
+ 
+ 	/*
+ 	 * Figure out how many notes we're going to need for each thread.
+ 	 */
+ 	info->thread_notes = 0;
+ 	for (i = 0; i < view->n; ++i)
+ 		if (view->regsets[i].core_note_type != 0)
+ 			++info->thread_notes;
+ 
+ 	/*
+ 	 * Sanity check.  We rely on regset 0 being in NT_PRSTATUS,
+ 	 * since it is our one special case.
+ 	 */
+ 	if (unlikely(info->thread_notes == 0) ||
+ 	    unlikely(view->regsets[0].core_note_type != NT_PRSTATUS)) {
+ 		WARN_ON(1);
+ 		return 0;
+ 	}
+ 
+ 	/*
+ 	 * Initialize the ELF file header.
+ 	 */
+ 	fill_elf_header(elf, phdrs,
+ 			view->e_machine, view->e_flags);
+ 
+ 	/*
+ 	 * Allocate a structure for each thread.
+ 	 */
+ 	for (ct = &dump_task->mm->core_state->dumper; ct; ct = ct->next) {
+ 		t = kzalloc(offsetof(struct elf_thread_core_info,
+ 				     notes[info->thread_notes]),
+ 			    GFP_KERNEL);
+ 		if (unlikely(!t))
+ 			return 0;
+ 
+ 		t->task = ct->task;
+ 		if (ct->task == dump_task || !info->thread) {
+ 			t->next = info->thread;
+ 			info->thread = t;
+ 		} else {
+ 			/*
+ 			 * Make sure to keep the original task at
+ 			 * the head of the list.
+ 			 */
+ 			t->next = info->thread->next;
+ 			info->thread->next = t;
+ 		}
+ 	}
+ 
+ 	/*
+ 	 * Now fill in each thread's information.
+ 	 */
+ 	for (t = info->thread; t != NULL; t = t->next)
+ 		if (!fill_thread_core_info(t, view, siginfo->si_signo, &info->size))
+ 			return 0;
+ 
+ 	/*
+ 	 * Fill in the process-wide notes: psinfo, siginfo, auxv and files.
+ 	 */
+ 	fill_psinfo(psinfo, dump_task->group_leader, dump_task->mm);
+ 	info->size += notesize(&info->psinfo);
+ 
+ 	fill_siginfo_note(&info->signote, &info->csigdata, siginfo);
+ 	info->size += notesize(&info->signote);
+ 
+ 	fill_auxv_note(&info->auxv, current->mm);
+ 	info->size += notesize(&info->auxv);
+ 
+ 	if (fill_files_note(&info->files) == 0)	/* the files note is optional */
+ 		info->size += notesize(&info->files);
+ 
+ 	return 1;
+ }
+ 
+ static size_t get_note_info_size(struct elf_note_info *info)
+ {
+ 	return info->size;	/* summed up while fill_note_info() ran */
+ }
+ 
+ /*
+  * Write all the notes for each thread.  When writing the first thread, the
+  * process-wide notes are interleaved after the first thread-specific note.
+  */
+ static int write_note_info(struct elf_note_info *info,
+ 			   struct coredump_params *cprm)
+ {
+ 	struct elf_thread_core_info *head = info->thread;
+ 	struct elf_thread_core_info *cur = head;
+ 
+ 	do {
+ 		int idx;
+ 
+ 		if (!writenote(&cur->notes[0], cprm))
+ 			return 0;
+ 
+ 		/* Process-wide notes go right after the first thread's notes[0]. */
+ 		if (cur == head) {
+ 			if (!writenote(&info->psinfo, cprm) ||
+ 			    !writenote(&info->signote, cprm) ||
+ 			    !writenote(&info->auxv, cprm))
+ 				return 0;
+ 			if (info->files.data && !writenote(&info->files, cprm))
+ 				return 0;
+ 		}
+ 
+ 		/* Remaining per-thread notes; slots without data are skipped. */
+ 		for (idx = 1; idx < info->thread_notes; ++idx)
+ 			if (cur->notes[idx].data &&
+ 			    !writenote(&cur->notes[idx], cprm))
+ 				return 0;
+ 
+ 		cur = cur->next;
+ 	} while (cur);
+ 
+ 	return 1;
+ }
+ 
+ static void free_note_info(struct elf_note_info *info)
+ {
+ 	struct elf_thread_core_info *cur, *next;
+ 	unsigned int n;
+ 	for (cur = info->thread; cur; cur = next) {
+ 		next = cur->next;	/* read before cur is freed */
+ 		/* notes[0] points into cur itself, so it is never kfree()d. */
+ 		WARN_ON(cur->notes[0].data && cur->notes[0].data != &cur->prstatus);
+ 		for (n = 1; n < info->thread_notes; ++n)
+ 			kfree(cur->notes[n].data);
+ 		kfree(cur);
+ 	}
+ 	kfree(info->psinfo.data);
+ 	kvfree(info->files.data);	/* came from kvmalloc() */
+ }
+ 
+ #else
+ 
+ /* Here is the structure in which the status of each thread is captured. */
+ struct elf_thread_status
+ {
+ 	struct list_head list;		/* link in elf_note_info.thread_list */
+ 	struct elf_prstatus prstatus;	/* NT_PRSTATUS */
+ 	elf_fpregset_t fpu;		/* NT_PRFPREG */
+ 	struct task_struct *thread;	/* task this record describes */
+ #ifdef ELF_CORE_COPY_XFPREGS
+ 	elf_fpxregset_t xfpu;		/* ELF_CORE_XFPREG_TYPE */
+ #endif
+ 	struct memelfnote notes[3];	/* prstatus, fpu and xfpu notes */
+ 	int num_notes;			/* notes recorded for this thread */
+ };
+ 
+ /*
+  * In order to add the specific thread information for the elf file format,
+  * we need to keep a linked list of every thread's prstatus and then create
+  * a single section for them in the final core file.
+  */
+ static int elf_dump_thread_status(long signr, struct elf_thread_status *t)
+ {
+ 	int sz = 0;	/* summed notesize() of this thread's notes; returned */
+ 	struct task_struct *p = t->thread;
+ 	t->num_notes = 0;
+ 
+ 	fill_prstatus(&t->prstatus, p, signr);
+ 	elf_core_copy_task_regs(p, &t->prstatus.pr_reg);	
+ 	
+ 	fill_note(&t->notes[0], "CORE", NT_PRSTATUS, sizeof(t->prstatus),
+ 		  &(t->prstatus));
+ 	t->num_notes++;
+ 	sz += notesize(&t->notes[0]);
+ 
+ 	if ((t->prstatus.pr_fpvalid = elf_core_copy_task_fpregs(p, NULL,
+ 								&t->fpu))) {
+ 		fill_note(&t->notes[1], "CORE", NT_PRFPREG, sizeof(t->fpu),
+ 			  &(t->fpu));
+ 		t->num_notes++;
+ 		sz += notesize(&t->notes[1]);
+ 	}
+ 
+ #ifdef ELF_CORE_COPY_XFPREGS
+ 	if (elf_core_copy_task_xfpregs(p, &t->xfpu)) {
+ 		fill_note(&t->notes[2], "LINUX", ELF_CORE_XFPREG_TYPE,
+ 			  sizeof(t->xfpu), &t->xfpu);
+ 		t->num_notes++;
+ 		sz += notesize(&t->notes[2]);
+ 	}
+ #endif	
+ 	return sz;
+ }
+ 
+ struct elf_note_info {
+ 	struct memelfnote *notes;	/* array allocated in elf_note_info_init() */
+ 	struct memelfnote *notes_files;	/* NT_FILE slot in notes[], or NULL */
+ 	struct elf_prstatus *prstatus;	/* NT_PRSTATUS */
+ 	struct elf_prpsinfo *psinfo;	/* NT_PRPSINFO */
+ 	struct list_head thread_list;	/* elf_thread_status records */
+ 	elf_fpregset_t *fpu;		/* NT_PRFPREG */
+ #ifdef ELF_CORE_COPY_XFPREGS
+ 	elf_fpxregset_t *xfpu;		/* ELF_CORE_XFPREG_TYPE */
+ #endif
+ 	user_siginfo_t csigdata;	/* payload of the NT_SIGINFO note */
+ 	int thread_status_size;		/* summed size of the thread_list notes */
+ 	int numnote;			/* entries of notes[] in use */
+ };
+ 
+ static int elf_note_info_init(struct elf_note_info *info)
+ {
+ 	memset(info, 0, sizeof(*info));	/* pointers not allocated below stay NULL */
+ 	INIT_LIST_HEAD(&info->thread_list);
+ 
+ 	/* Allocate space for ELF notes */
+ 	info->notes = kmalloc_array(8, sizeof(struct memelfnote), GFP_KERNEL);
+ 	if (!info->notes)
+ 		return 0;	/* 0 means failure; nothing is freed here */
+ 	info->psinfo = kmalloc(sizeof(*info->psinfo), GFP_KERNEL);
+ 	if (!info->psinfo)
+ 		return 0;
+ 	info->prstatus = kmalloc(sizeof(*info->prstatus), GFP_KERNEL);
+ 	if (!info->prstatus)
+ 		return 0;
+ 	info->fpu = kmalloc(sizeof(*info->fpu), GFP_KERNEL);
+ 	if (!info->fpu)
+ 		return 0;
+ #ifdef ELF_CORE_COPY_XFPREGS
+ 	info->xfpu = kmalloc(sizeof(*info->xfpu), GFP_KERNEL);
+ 	if (!info->xfpu)
+ 		return 0;
+ #endif
+ 	return 1;	/* every buffer was allocated */
+ }
+ 
+ static int fill_note_info(struct elfhdr *elf, int phdrs,
+ 			  struct elf_note_info *info,
+ 			  const kernel_siginfo_t *siginfo, struct pt_regs *regs)
+ {
+ 	struct core_thread *ct;
+ 	struct elf_thread_status *ets;
+ 
+ 	if (!elf_note_info_init(info))
+ 		return 0;
+ 
+ 	for (ct = current->mm->core_state->dumper.next;
+ 					ct; ct = ct->next) {
+ 		ets = kzalloc(sizeof(*ets), GFP_KERNEL);
+ 		if (!ets)
+ 			return 0;
+ 
+ 		ets->thread = ct->task;
+ 		list_add(&ets->list, &info->thread_list);
+ 	}
+ 
+ 	list_for_each_entry(ets, &info->thread_list, list) {
+ 		int sz;
+ 
+ 		sz = elf_dump_thread_status(siginfo->si_signo, ets);
+ 		info->thread_status_size += sz;
+ 	}
+ 	/* now collect the dump for the current */
+ 	memset(info->prstatus, 0, sizeof(*info->prstatus));
+ 	fill_prstatus(info->prstatus, current, siginfo->si_signo);
+ 	elf_core_copy_regs(&info->prstatus->pr_reg, regs);
+ 
+ 	/* Set up header */
+ 	fill_elf_header(elf, phdrs, ELF_ARCH, ELF_CORE_EFLAGS);
+ 
+ 	/*
+ 	 * Set up the notes in similar form to SVR4 core dumps made
+ 	 * with info from their /proc.
+ 	 */
+ 
+ 	fill_note(info->notes + 0, "CORE", NT_PRSTATUS,
+ 		  sizeof(*info->prstatus), info->prstatus);
+ 	fill_psinfo(info->psinfo, current->group_leader, current->mm);
+ 	fill_note(info->notes + 1, "CORE", NT_PRPSINFO,
+ 		  sizeof(*info->psinfo), info->psinfo);
+ 
+ 	fill_siginfo_note(info->notes + 2, &info->csigdata, siginfo);
+ 	fill_auxv_note(info->notes + 3, current->mm);
+ 	info->numnote = 4;	/* slots 0-3 are filled unconditionally */
+ 
+ 	if (fill_files_note(info->notes + info->numnote) == 0) {
+ 		info->notes_files = info->notes + info->numnote;
+ 		info->numnote++;
+ 	}
+ 
+ 	/* Try to dump the FPU. */
+ 	info->prstatus->pr_fpvalid = elf_core_copy_task_fpregs(current, regs,
+ 							       info->fpu);
+ 	if (info->prstatus->pr_fpvalid)
+ 		fill_note(info->notes + info->numnote++,
+ 			  "CORE", NT_PRFPREG, sizeof(*info->fpu), info->fpu);
+ #ifdef ELF_CORE_COPY_XFPREGS
+ 	if (elf_core_copy_task_xfpregs(current, info->xfpu))
+ 		fill_note(info->notes + info->numnote++,
+ 			  "LINUX", ELF_CORE_XFPREG_TYPE,
+ 			  sizeof(*info->xfpu), info->xfpu);
+ #endif
+ 
+ 	return 1;	/* 1 on success, 0 when an allocation failed */
+ }
+ 
+ static size_t get_note_info_size(struct elf_note_info *info)
+ {
+ 	/* Start from the thread-list notes, sized while they were collected. */
+ 	int total = info->thread_status_size;
+ 	int idx = info->numnote;
+ 
+ 	/* Add every note held in info->notes[]. */
+ 	while (idx-- > 0)
+ 		total += notesize(&info->notes[idx]);
+ 
+ 	return total;
+ }
+ 
+ static int write_note_info(struct elf_note_info *info,
+ 			   struct coredump_params *cprm)
+ {
+ 	struct elf_thread_status *thread;
+ 	int n;
+ 
+ 	/* Notes gathered in info->notes[] go out first, in array order. */
+ 	for (n = 0; n < info->numnote; n++)
+ 		if (!writenote(&info->notes[n], cprm))
+ 			return 0;
+ 	/* Then the notes recorded for each entry on the thread list. */
+ 	list_for_each_entry(thread, &info->thread_list, list) {
+ 		for (n = 0; n < thread->num_notes; n++)
+ 			if (!writenote(&thread->notes[n], cprm))
+ 				return 0;
+ 	}
+ 
+ 	return 1;
+ }
+ 
+ static void free_note_info(struct elf_note_info *info)
+ {
+ 	struct list_head *head = &info->thread_list;
+ 	while (!list_empty(head)) {
+ 		struct elf_thread_status *ets =
+ 			list_entry(head->next, struct elf_thread_status, list);
+ 		list_del(&ets->list);
+ 		kfree(ets);
+ 	}
+ 	/* notes_files points into notes[], so release its data first. */
+ 	if (info->notes_files)
+ 		kvfree(info->notes_files->data);
+ 	kfree(info->notes);
+ 	kfree(info->fpu);
+ #ifdef ELF_CORE_COPY_XFPREGS
+ 	kfree(info->xfpu);
+ #endif
+ 	kfree(info->psinfo);
+ 	kfree(info->prstatus);
+ }
+ 
+ #endif
+ 
+ /* Pick the vma where a walk over the task's mappings starts. */
+ static struct vm_area_struct *first_vma(struct task_struct *tsk,
+ 					struct vm_area_struct *gate_vma)
+ {
+ 	struct vm_area_struct *head = tsk->mm->mmap;
+ 
+ 	/* An empty mmap list leaves only gate_vma, which may be NULL. */
+ 	return head ? head : gate_vma;
+ }
+ /*
+  * Helper function for iterating across a vma list.  It ensures that the caller
+  * will visit `gate_vma' prior to terminating the search.
+  */
+ static struct vm_area_struct *next_vma(struct vm_area_struct *this_vma,
+ 					struct vm_area_struct *gate_vma)
+ {
+ 	struct vm_area_struct *following = this_vma->vm_next;
+ 
+ 	if (following)
+ 		return following;
+ 	/* Past the end of the list: hand out gate_vma once, then stop. */
+ 	if (this_vma != gate_vma)
+ 		return gate_vma;
+ 	return NULL;
+ }
+ 
+ static void fill_extnum_info(struct elfhdr *elf, struct elf_shdr *shdr4extnum,
+ 			     elf_addr_t e_shoff, int segs)
+ {
+ 	/* The file gets exactly one section header, placed at e_shoff. */
+ 	elf->e_shnum = 1;
+ 	elf->e_shoff = e_shoff;
+ 	elf->e_shentsize = sizeof(*shdr4extnum);
+ 	elf->e_shstrndx = SHN_UNDEF;
+ 
+ 	memset(shdr4extnum, 0, sizeof(*shdr4extnum));
+ 	shdr4extnum->sh_type = SHT_NULL;
+ 	shdr4extnum->sh_info = segs;	/* carries the segment count */
+ 	shdr4extnum->sh_link = elf->e_shstrndx;
+ 	shdr4extnum->sh_size = elf->e_shnum;
+ }
+ 
+ /*
+  * Actual dumper
+  *
+  * This is a two-pass process; first we find the offsets of the bits,
+  * and then they are actually written out.  If we run out of core limit
+  * we just truncate.
+  */
+ static int elf_core_dump(struct coredump_params *cprm)
+ {
+ 	int has_dumped = 0;
+ 	mm_segment_t fs;
+ 	int segs, i;
+ 	size_t vma_data_size = 0;
+ 	struct vm_area_struct *vma, *gate_vma;
+ 	struct elfhdr *elf = NULL;
+ 	loff_t offset = 0, dataoff;
+ 	struct elf_note_info info = { };
+ 	struct elf_phdr *phdr4note = NULL;
+ 	struct elf_shdr *shdr4extnum = NULL;
+ 	Elf_Half e_phnum;
+ 	elf_addr_t e_shoff;
+ 	elf_addr_t *vma_filesz = NULL;
+ 
+ 	/*
+ 	 * We no longer stop all VM operations.
+ 	 * 
+ 	 * This is because those processes that could possibly change map_count
+ 	 * or the mmap / vma pages are now blocked in do_exit on current
+ 	 * finishing this core dump.
+ 	 *
+ 	 * Only ptrace can touch these memory addresses, but it doesn't change
+ 	 * the map_count or the pages allocated. So no possibility of crashing
+ 	 * exists while dumping the mm->vm_next areas to the core file.
+ 	 */
+   
+ 	/* alloc memory for large data structures: too large to be on stack */
+ 	elf = kmalloc(sizeof(*elf), GFP_KERNEL);
+ 	if (!elf)
+ 		goto out;
+ 	/*
+ 	 * The number of segs are recorded into ELF header as 16bit value.
+ 	 * Please check DEFAULT_MAX_MAP_COUNT definition when you modify here.
+ 	 */
+ 	segs = current->mm->map_count;
+ 	segs += elf_core_extra_phdrs();
+ 
+ 	gate_vma = get_gate_vma(current->mm);
+ 	if (gate_vma != NULL)
+ 		segs++;
+ 
+ 	/* for notes section */
+ 	segs++;
+ 
+ 	/* If segs > PN_XNUM(0xffff), then e_phnum overflows. To avoid
+ 	 * this, kernel supports extended numbering. Have a look at
+ 	 * include/linux/elf.h for further information. */
+ 	e_phnum = segs > PN_XNUM ? PN_XNUM : segs;
+ 
+ 	/*
+ 	 * Collect all the non-memory information about the process for the
+ 	 * notes.  This also sets up the file header.
+ 	 */
+ 	if (!fill_note_info(elf, e_phnum, &info, cprm->siginfo, cprm->regs))
+ 		goto cleanup;
+ 
+ 	has_dumped = 1;	/* returned as 1 from here on, even if a write fails */
+ 
+ 	fs = get_fs();
+ 	set_fs(KERNEL_DS);	/* restored at end_coredump */
+ 
+ 	offset += sizeof(*elf);				/* Elf header */
+ 	offset += segs * sizeof(struct elf_phdr);	/* Program headers */
+ 
+ 	/* Write notes phdr entry */
+ 	{
+ 		size_t sz = get_note_info_size(&info);
+ 
+ 		sz += elf_coredump_extra_notes_size();
+ 
+ 		phdr4note = kmalloc(sizeof(*phdr4note), GFP_KERNEL);
+ 		if (!phdr4note)
+ 			goto end_coredump;
+ 
+ 		fill_elf_note_phdr(phdr4note, sz, offset);
+ 		offset += sz;
+ 	}
+ 
+ 	dataoff = offset = roundup(offset, ELF_EXEC_PAGESIZE);
+ 
+ 	if (segs - 1 > ULONG_MAX / sizeof(*vma_filesz))
+ 		goto end_coredump;
+ 	vma_filesz = kvmalloc(array_size(sizeof(*vma_filesz), (segs - 1)),
+ 			      GFP_KERNEL);
+ 	if (ZERO_OR_NULL_PTR(vma_filesz))
+ 		goto end_coredump;
+ 
+ 	/* First pass: record how many bytes of each vma will be dumped. */
+ 	for (i = 0, vma = first_vma(current, gate_vma); vma != NULL;
+ 			vma = next_vma(vma, gate_vma)) {
+ 		unsigned long dump_size;
+ 
+ 		dump_size = vma_dump_size(vma, cprm->mm_flags);
+ 		vma_filesz[i++] = dump_size;
+ 		vma_data_size += dump_size;
+ 	}
+ 
+ 	offset += vma_data_size;
+ 	offset += elf_core_extra_data_size();
+ 	e_shoff = offset;	/* the extended-numbering section header goes last */
+ 
+ 	if (e_phnum == PN_XNUM) {
+ 		shdr4extnum = kmalloc(sizeof(*shdr4extnum), GFP_KERNEL);
+ 		if (!shdr4extnum)
+ 			goto end_coredump;
+ 		fill_extnum_info(elf, shdr4extnum, e_shoff, segs);
+ 	}
+ 
+ 	offset = dataoff;	/* back to where the segment data will start */
+ 
+ 	if (!dump_emit(cprm, elf, sizeof(*elf)))
+ 		goto end_coredump;
+ 
+ 	if (!dump_emit(cprm, phdr4note, sizeof(*phdr4note)))
+ 		goto end_coredump;
+ 
+ 	/* Write program headers for segments dump */
+ 	for (i = 0, vma = first_vma(current, gate_vma); vma != NULL;
+ 			vma = next_vma(vma, gate_vma)) {
+ 		struct elf_phdr phdr;
+ 
+ 		phdr.p_type = PT_LOAD;
+ 		phdr.p_offset = offset;
+ 		phdr.p_vaddr = vma->vm_start;
+ 		phdr.p_paddr = 0;
+ 		phdr.p_filesz = vma_filesz[i++];
+ 		phdr.p_memsz = vma->vm_end - vma->vm_start;
+ 		offset += phdr.p_filesz;
+ 		phdr.p_flags = vma->vm_flags & VM_READ ? PF_R : 0;
+ 		if (vma->vm_flags & VM_WRITE)
+ 			phdr.p_flags |= PF_W;
+ 		if (vma->vm_flags & VM_EXEC)
+ 			phdr.p_flags |= PF_X;
+ 		phdr.p_align = ELF_EXEC_PAGESIZE;
+ 
+ 		if (!dump_emit(cprm, &phdr, sizeof(phdr)))
+ 			goto end_coredump;
+ 	}
+ 
+ 	if (!elf_core_write_extra_phdrs(cprm, offset))
+ 		goto end_coredump;
+ 
+  	/* write out the notes section */
+ 	if (!write_note_info(&info, cprm))
+ 		goto end_coredump;
+ 
+ 	if (elf_coredump_extra_notes_write(cprm))
+ 		goto end_coredump;
+ 
+ 	/* Align to page */
+ 	if (!dump_skip(cprm, dataoff - cprm->pos))
+ 		goto end_coredump;
+ 
+ 	/* Second pass: emit the recorded number of bytes of each vma. */
+ 	for (i = 0, vma = first_vma(current, gate_vma); vma != NULL;
+ 			vma = next_vma(vma, gate_vma)) {
+ 		unsigned long addr;
+ 		unsigned long end;
+ 
+ 		end = vma->vm_start + vma_filesz[i++];
+ 
+ 		for (addr = vma->vm_start; addr < end; addr += PAGE_SIZE) {
+ 			struct page *page;
+ 			int stop;
+ 
+ 			page = get_dump_page(addr);
+ 			if (page) {
+ 				void *kaddr = kmap(page);
+ 				stop = !dump_emit(cprm, kaddr, PAGE_SIZE);
+ 				kunmap(page);
+ 				put_page(page);
+ 			} else
+ 				stop = !dump_skip(cprm, PAGE_SIZE);	/* no page: skip */
+ 			if (stop)
+ 				goto end_coredump;
+ 		}
+ 	}
+ 	dump_truncate(cprm);
+ 
+ 	if (!elf_core_write_extra_data(cprm))
+ 		goto end_coredump;
+ 
+ 	if (e_phnum == PN_XNUM) {
+ 		if (!dump_emit(cprm, shdr4extnum, sizeof(*shdr4extnum)))
+ 			goto end_coredump;
+ 	}
+ 
+ end_coredump:
+ 	set_fs(fs);
+ 
+ cleanup:
+ 	free_note_info(&info);
+ 	kfree(shdr4extnum);
+ 	kvfree(vma_filesz);
+ 	kfree(phdr4note);
+ 	kfree(elf);
+ out:
+ 	return has_dumped;
+ }
+ 
+ #endif		/* CONFIG_ELF_CORE */
+ 
+ #ifdef CONFIG_MINISEC_MPROTECT
+ /* PaX: a vma that matches a PT_GNU_RELRO segment loses VM_MAYWRITE for good.
+  * The one-time VM_MAYWRITE grant for non-PIC (TEXTREL) libraries is kept but
+  * inert: nothing in this function sets is_textrel_rw or is_textrel_rx.
+  *
+  * The checks favour ld-linux.so behaviour which operates on a per ELF segment
+  * basis because we want to allow the common case and not the special ones.
+  */
+ static void elf_handle_mprotect(struct vm_area_struct *vma, unsigned long newflags)
+ {
+ 	struct elfhdr elf_h;
+ 	struct elf_phdr elf_p;
+ 	unsigned long i, j, oldflags;	/* j walks PT_DYNAMIC so i stays intact */
+ 	loff_t pos = 0;	/* kernel_read() reads at *pos and advances it */
+ 	bool is_textrel_rw = false, is_textrel_rx = false, is_relro;
+ 
+ 	if (!(vma->vm_mm->pax_flags & MF_PAX_MPROTECT) || !vma->vm_file)
+ 		return;
+ 
+ 	oldflags = vma->vm_flags & (VM_MAYEXEC | VM_MAYWRITE | VM_MAYREAD | VM_EXEC | VM_WRITE | VM_READ);
+ 	newflags &= VM_MAYEXEC | VM_MAYWRITE | VM_MAYREAD | VM_EXEC | VM_WRITE | VM_READ;
+ 
+ 	/* possible RELRO */
+ 	is_relro = vma->anon_vma && oldflags == (VM_MAYWRITE | VM_MAYREAD | VM_READ) && newflags == (VM_MAYWRITE | VM_MAYREAD | VM_READ);
+ 
+ 	if (!is_textrel_rw && !is_textrel_rx && !is_relro)
+ 		return;
+ 
+ 	if (sizeof(elf_h) != kernel_read(vma->vm_file, &elf_h, sizeof(elf_h), &pos) ||
+ 	    memcmp(elf_h.e_ident, ELFMAG, SELFMAG) ||
+ 	    (is_relro && (elf_h.e_type != ET_DYN && elf_h.e_type != ET_EXEC)) ||
+ 	    !elf_check_arch(&elf_h) ||
+ 	    elf_h.e_phentsize != sizeof(struct elf_phdr) ||
+ 	    elf_h.e_phnum > 65536UL / sizeof(struct elf_phdr))
+ 		return;
+ 
+ 	for (i = 0UL; i < elf_h.e_phnum; i++) {
+ 		pos = elf_h.e_phoff + i*sizeof(elf_p);
+ 		if (sizeof(elf_p) != kernel_read(vma->vm_file, &elf_p, sizeof(elf_p), &pos))
+ 			return;
+ 		switch (elf_p.p_type) {
+ 		case PT_DYNAMIC:
+ 			if (!is_textrel_rw && !is_textrel_rx)
+ 				continue;
+ 			j = 0UL;
+ 			while ((j+1) * sizeof(elf_dyn) <= elf_p.p_filesz) {
+ 				elf_dyn dyn;
+ 				pos = elf_p.p_offset + j*sizeof(dyn);
+ 				if (sizeof(dyn) != kernel_read(vma->vm_file, &dyn, sizeof(dyn), &pos))
+ 					break;
+ 				if (dyn.d_tag == DT_NULL)
+ 					break;
+ 				if (dyn.d_tag == DT_TEXTREL || (dyn.d_tag == DT_FLAGS && (dyn.d_un.d_val & DF_TEXTREL))) {
+ 					if (is_textrel_rw)
+ 						vma->vm_flags |= VM_MAYWRITE;
+ 					else
+ 						/* PaX: disallow write access after relocs are done, hopefully noone else needs it... */
+ 						vma->vm_flags &= ~VM_MAYWRITE;
+ 					break;
+ 				}
+ 				j++;
+ 			}
+ 			is_textrel_rw = false;
+ 			is_textrel_rx = false;
+ 			continue;
+ 
+ 		case PT_GNU_RELRO:
+ 			if (!is_relro)
+ 				continue;
+ 			if ((elf_p.p_offset >> PAGE_SHIFT) == vma->vm_pgoff && ELF_PAGEALIGN(elf_p.p_memsz) == vma->vm_end - vma->vm_start)
+ 				vma->vm_flags &= ~VM_MAYWRITE;
+ 			is_relro = false;	/* only the first PT_GNU_RELRO is considered */
+ 			continue;
+ 
+ #ifdef CONFIG_MINISEC_PT_PAX_FLAGS
+ 		case PT_PAX_FLAGS: {
+ 			const char *msg_mprotect = "", *msg_emutramp = "";
+ 			char *buffer_lib, *buffer_exe;
+ 
+ 			if (elf_p.p_flags & PF_NOMPROTECT)
+ 				msg_mprotect = "MPROTECT disabled";
+ 
+ #ifdef CONFIG_MINISEC_EMUTRAMP
+ 			if (!(vma->vm_mm->pax_flags & MF_PAX_EMUTRAMP) && !(elf_p.p_flags & PF_NOEMUTRAMP))
+ 				msg_emutramp = "EMUTRAMP enabled";
+ #endif
+ 
+ 			if (!msg_mprotect[0] && !msg_emutramp[0])
+ 				continue;
+ 
+ 			if (!printk_ratelimit())
+ 				continue;
+ 
+ 			buffer_lib = (char *)__get_free_page(GFP_KERNEL);
+ 			buffer_exe = (char *)__get_free_page(GFP_KERNEL);
+ 			if (buffer_lib && buffer_exe) {
+ 				char *path_lib, *path_exe;
+ 
+ 				path_lib = pax_get_path(&vma->vm_file->f_path, buffer_lib, PAGE_SIZE);
+ 				path_exe = pax_get_path(&vma->vm_mm->exe_file->f_path, buffer_exe, PAGE_SIZE);
+ 
+ 				pr_info("PAX: %s wants %s%s%s on %s\n", path_lib, msg_mprotect,
+ 					(msg_mprotect[0] && msg_emutramp[0] ? " and " : ""), msg_emutramp, path_exe);
+ 
+ 			}
+ 			free_page((unsigned long)buffer_exe);
+ 			free_page((unsigned long)buffer_lib);
+ 			continue;
+ 		}
+ #endif
+ 
+ 		}
+ 	}
+ }
+ #endif
+ 
+ static int __init init_elf_binfmt(void)
+ {
+ 	register_binfmt(&elf_format);	/* register the ELF binary format */
+ 	return 0;
+ }
+ 
+ static void __exit exit_elf_binfmt(void)
+ {
+ 	/* Unregister the ELF binary format registered by init_elf_binfmt(). */
+ 	unregister_binfmt(&elf_format);
+ }
+ 
+ core_initcall(init_elf_binfmt);
+ module_exit(exit_elf_binfmt);
+ MODULE_LICENSE("GPL");
diff --color -rcNP Master/fs/binfmt_elf.c.rej OG/fs/binfmt_elf.c.rej
*** Master/fs/binfmt_elf.c.rej	1969-12-31 19:00:00.000000000 -0500
--- OG/fs/binfmt_elf.c.rej	2021-04-20 15:11:27.312000000 -0400
***************
*** 0 ****
--- 1,191 ----
+ *** fs/binfmt_elf.c	2021-03-13 14:01:45.000000000 +0200
+ --- fs/binfmt_elf.c	2021-03-11 15:06:51.000000000 +0200
+ ***************
+ *** 235,241 ****
+   	} while (0)
+   
+   #ifdef ARCH_DLINFO
+ ! 	/*
+   	 * ARCH_DLINFO must come first so PPC can do its special alignment of
+   	 * AUXV.
+   	 * update AT_VECTOR_SIZE_ARCH if the number of NEW_AUX_ENT() in
+ --- 227,233 ----
+   	} while (0)
+   
+   #ifdef ARCH_DLINFO
+ ! 	/*
+   	 * ARCH_DLINFO must come first so PPC can do its special alignment of
+   	 * AUXV.
+   	 * update AT_VECTOR_SIZE_ARCH if the number of NEW_AUX_ENT() in
+ ***************
+ *** 697,703 ****
+   		retval = -ENOMEM;
+   		goto out_ret;
+   	}
+ ! 
+   	/* Get the exec-header */
+   	loc->elf_ex = *((struct elfhdr *)bprm->buf);
+   
+ --- 359,365 ----
+   		retval = -ENOMEM;
+   		goto out_ret;
+   	}
+ ! 
+   	/* Get the exec-header */
+   	loc->elf_ex = *((struct elfhdr *)bprm->buf);
+   
+ ***************
+ *** 866,872 ****
+   				 executable_stack);
+   	if (retval < 0)
+   		goto out_free_dentry;
+ ! 
+   	elf_bss = 0;
+   	elf_brk = 0;
+   
+ --- 490,496 ----
+   				 executable_stack);
+   	if (retval < 0)
+   		goto out_free_dentry;
+ ! 
+   	elf_bss = 0;
+   	elf_brk = 0;
+   
+ ***************
+ *** 888,894 ****
+   
+   		if (unlikely (elf_brk > elf_bss)) {
+   			unsigned long nbyte;
+ ! 
+   			/* There was a PT_LOAD segment with p_memsz > p_filesz
+   			   before this one. Map anonymous pages, if needed,
+   			   and clear the area.  */
+ --- 512,518 ----
+   
+   		if (unlikely (elf_brk > elf_bss)) {
+   			unsigned long nbyte;
+ ! 
+   			/* There was a PT_LOAD segment with p_memsz > p_filesz
+   			   before this one. Map anonymous pages, if needed,
+   			   and clear the area.  */
+ ***************
+ *** 1456,1462 ****
+   	phdr->p_align = 0;
+   }
+   
+ ! static void fill_note(struct memelfnote *note, const char *name, int type,
+   		unsigned int sz, void *data)
+   {
+   	note->name = name;
+ --- 1080,1086 ----
+   	phdr->p_align = 0;
+   }
+   
+ ! static void fill_note(struct memelfnote *note, const char *name, int type,
+   		unsigned int sz, void *data)
+   {
+   	note->name = name;
+ ***************
+ *** 1508,1514 ****
+   {
+   	const struct cred *cred;
+   	unsigned int i, len;
+ ! 
+   	/* first copy the parameters from user space */
+   	memset(psinfo, 0, sizeof(struct elf_prpsinfo));
+   
+ --- 1132,1138 ----
+   {
+   	const struct cred *cred;
+   	unsigned int i, len;
+ ! 
+   	/* first copy the parameters from user space */
+   	memset(psinfo, 0, sizeof(struct elf_prpsinfo));
+   
+ ***************
+ *** 1542,1548 ****
+   	SET_GID(psinfo->pr_gid, from_kgid_munged(cred->user_ns, cred->gid));
+   	rcu_read_unlock();
+   	strncpy(psinfo->pr_fname, p->comm, sizeof(psinfo->pr_fname));
+ ! 
+   	return 0;
+   }
+   
+ --- 1166,1172 ----
+   	SET_GID(psinfo->pr_gid, from_kgid_munged(cred->user_ns, cred->gid));
+   	rcu_read_unlock();
+   	strncpy(psinfo->pr_fname, p->comm, sizeof(psinfo->pr_fname));
+ ! 
+   	return 0;
+   }
+   
+ ***************
+ *** 1939,1946 ****
+   	t->num_notes = 0;
+   
+   	fill_prstatus(&t->prstatus, p, signr);
+ ! 	elf_core_copy_task_regs(p, &t->prstatus.pr_reg);
+ ! 
+   	fill_note(&t->notes[0], "CORE", NT_PRSTATUS, sizeof(t->prstatus),
+   		  &(t->prstatus));
+   	t->num_notes++;
+ --- 1563,1570 ----
+   	t->num_notes = 0;
+   
+   	fill_prstatus(&t->prstatus, p, signr);
+ ! 	elf_core_copy_task_regs(p, &t->prstatus.pr_reg);
+ ! 
+   	fill_note(&t->notes[0], "CORE", NT_PRSTATUS, sizeof(t->prstatus),
+   		  &(t->prstatus));
+   	t->num_notes++;
+ ***************
+ *** 1961,1967 ****
+   		t->num_notes++;
+   		sz += notesize(&t->notes[2]);
+   	}
+ ! #endif
+   	return sz;
+   }
+   
+ --- 1585,1591 ----
+   		t->num_notes++;
+   		sz += notesize(&t->notes[2]);
+   	}
+ ! #endif
+   	return sz;
+   }
+   
+ ***************
+ *** 2199,2205 ****
+   
+   	/*
+   	 * We no longer stop all VM operations.
+ ! 	 *
+   	 * This is because those proceses that could possibly change map_count
+   	 * or the mmap / vma pages are now blocked in do_exit on current
+   	 * finishing this core dump.
+ --- 1823,1829 ----
+   
+   	/*
+   	 * We no longer stop all VM operations.
+ ! 	 *
+   	 * This is because those proceses that could possibly change map_count
+   	 * or the mmap / vma pages are now blocked in do_exit on current
+   	 * finishing this core dump.
+ ***************
+ *** 2208,2214 ****
+   	 * the map_count or the pages allocated. So no possibility of crashing
+   	 * exists while dumping the mm->vm_next areas to the core file.
+   	 */
+ ! 
+   	/* alloc memory for large data structures: too large to be on stack */
+   	elf = kmalloc(sizeof(*elf), GFP_KERNEL);
+   	if (!elf)
+ --- 1832,1838 ----
+   	 * the map_count or the pages allocated. So no possibility of crashing
+   	 * exists while dumping the mm->vm_next areas to the core file.
+   	 */
+ ! 
+   	/* alloc memory for large data structures: too large to be on stack */
+   	elf = kmalloc(sizeof(*elf), GFP_KERNEL);
+   	if (!elf)
diff --color -rcNP Master/fs/dcache.c OG/fs/dcache.c
*** Master/fs/dcache.c	2021-04-20 14:17:31.000000000 -0400
--- OG/fs/dcache.c	2021-04-20 15:11:34.506000000 -0400
***************
*** 268,274 ****
  {
  	struct dentry *dentry = container_of(head, struct dentry, d_u.d_rcu);
  
! 	kmem_cache_free(dentry_cache, dentry); 
  }
  
  static void __d_free_external(struct rcu_head *head)
--- 268,274 ----
  {
  	struct dentry *dentry = container_of(head, struct dentry, d_u.d_rcu);
  
! 	kmem_cache_free(dentry_cache, dentry);
  }
  
  static void __d_free_external(struct rcu_head *head)
***************
*** 811,817 ****
  }
  
  
! /* 
   * This is dput
   *
   * This is complicated by the fact that we do not want to put
--- 811,817 ----
  }
  
  
! /*
   * This is dput
   *
   * This is complicated by the fact that we do not want to put
***************
*** 830,836 ****
  
  /*
   * dput - release a dentry
!  * @dentry: dentry to release 
   *
   * Release a dentry. This will drop the usage count and if appropriate
   * call the dentry unlink method as well as removing it from the queues and
--- 830,836 ----
  
  /*
   * dput - release a dentry
!  * @dentry: dentry to release
   *
   * Release a dentry. This will drop the usage count and if appropriate
   * call the dentry unlink method as well as removing it from the queues and
***************
*** 1680,1686 ****
   * available. On a success the dentry is returned. The name passed in is
   * copied and the copy passed in may be reused after this call.
   */
!  
  struct dentry *__d_alloc(struct super_block *sb, const struct qstr *name)
  {
  	struct dentry *dentry;
--- 1680,1686 ----
   * available. On a success the dentry is returned. The name passed in is
   * copied and the copy passed in may be reused after this call.
   */
! 
  struct dentry *__d_alloc(struct super_block *sb, const struct qstr *name)
  {
  	struct dentry *dentry;
***************
*** 1707,1720 ****
  						  GFP_KERNEL_ACCOUNT |
  						  __GFP_RECLAIMABLE);
  		if (!p) {
! 			kmem_cache_free(dentry_cache, dentry); 
  			return NULL;
  		}
  		atomic_set(&p->u.count, 1);
  		dname = p->name;
  	} else  {
  		dname = dentry->d_iname;
! 	}	
  
  	dentry->d_name.len = name->len;
  	dentry->d_name.hash = name->hash;
--- 1707,1720 ----
  						  GFP_KERNEL_ACCOUNT |
  						  __GFP_RECLAIMABLE);
  		if (!p) {
! 			kmem_cache_free(dentry_cache, dentry);
  			return NULL;
  		}
  		atomic_set(&p->u.count, 1);
  		dname = p->name;
  	} else  {
  		dname = dentry->d_iname;
! 	}
  
  	dentry->d_name.len = name->len;
  	dentry->d_name.hash = name->hash;
***************
*** 1733,1738 ****
--- 1733,1741 ----
  	dentry->d_sb = sb;
  	dentry->d_op = NULL;
  	dentry->d_fsdata = NULL;
+ #ifdef CONFIG_MINISEC_CHROOT_RENAME
+ 	atomic_set(&dentry->chroot_refcnt, 0);
+ #endif
  	INIT_HLIST_BL_NODE(&dentry->d_hash);
  	INIT_LIST_HEAD(&dentry->d_lru);
  	INIT_LIST_HEAD(&dentry->d_subdirs);
***************
*** 1946,1952 ****
   * (or otherwise set) by the caller to indicate that it is now
   * in use by the dcache.
   */
!  
  void d_instantiate(struct dentry *entry, struct inode * inode)
  {
  	BUG_ON(!hlist_unhashed(&entry->d_u.d_alias));
--- 1949,1955 ----
   * (or otherwise set) by the caller to indicate that it is now
   * in use by the dcache.
   */
! 
  void d_instantiate(struct dentry *entry, struct inode * inode)
  {
  	BUG_ON(!hlist_unhashed(&entry->d_u.d_alias));
***************
*** 2156,2162 ****
  		if (!found) {
  			iput(inode);
  			return ERR_PTR(-ENOMEM);
! 		} 
  	}
  	res = d_splice_alias(inode, found);
  	if (res) {
--- 2159,2165 ----
  		if (!found) {
  			iput(inode);
  			return ERR_PTR(-ENOMEM);
! 		}
  	}
  	res = d_splice_alias(inode, found);
  	if (res) {
***************
*** 2365,2371 ****
  	 * See Documentation/filesystems/path-lookup.txt for more details.
  	 */
  	rcu_read_lock();
! 	
  	hlist_bl_for_each_entry_rcu(dentry, node, b, d_hash) {
  
  		if (dentry->d_name.hash != hash)
--- 2368,2374 ----
  	 * See Documentation/filesystems/path-lookup.txt for more details.
  	 */
  	rcu_read_lock();
! 
  	hlist_bl_for_each_entry_rcu(dentry, node, b, d_hash) {
  
  		if (dentry->d_name.hash != hash)
***************
*** 2428,2434 ****
   * it from the hash queues and waiting for
   * it to be deleted later when it has no users
   */
!  
  /**
   * d_delete - delete a dentry
   * @dentry: The dentry to delete
--- 2431,2437 ----
   * it from the hash queues and waiting for
   * it to be deleted later when it has no users
   */
! 
  /**
   * d_delete - delete a dentry
   * @dentry: The dentry to delete
***************
*** 2436,2442 ****
   * Turn the dentry into a negative dentry if possible, otherwise
   * remove it from the hash queues so it can be deleted later
   */
!  
  void d_delete(struct dentry * dentry)
  {
  	struct inode *inode = dentry->d_inode;
--- 2439,2445 ----
   * Turn the dentry into a negative dentry if possible, otherwise
   * remove it from the hash queues so it can be deleted later
   */
! 
  void d_delete(struct dentry * dentry)
  {
  	struct inode *inode = dentry->d_inode;
***************
*** 2472,2478 ****
   *
   * Adds a dentry to the hash according to its name.
   */
!  
  void d_rehash(struct dentry * entry)
  {
  	spin_lock(&entry->d_lock);
--- 2475,2481 ----
   *
   * Adds a dentry to the hash according to its name.
   */
! 
  void d_rehash(struct dentry * entry)
  {
  	spin_lock(&entry->d_lock);
***************
*** 3063,3069 ****
   * Returns false otherwise.
   * Caller must ensure that "new_dentry" is pinned before calling is_subdir()
   */
!   
  bool is_subdir(struct dentry *new_dentry, struct dentry *old_dentry)
  {
  	bool result;
--- 3066,3072 ----
   * Returns false otherwise.
   * Caller must ensure that "new_dentry" is pinned before calling is_subdir()
   */
! 
  bool is_subdir(struct dentry *new_dentry, struct dentry *old_dentry)
  {
  	bool result;
diff --color -rcNP Master/fs/dcache.c.orig OG/fs/dcache.c.orig
*** Master/fs/dcache.c.orig	1969-12-31 19:00:00.000000000 -0500
--- OG/fs/dcache.c.orig	2021-04-20 15:10:45.379000000 -0400
***************
*** 0 ****
--- 1,3221 ----
+ // SPDX-License-Identifier: GPL-2.0-only
+ /*
+  * fs/dcache.c
+  *
+  * Complete reimplementation
+  * (C) 1997 Thomas Schoebel-Theuer,
+  * with heavy changes by Linus Torvalds
+  */
+ 
+ /*
+  * Notes on the allocation strategy:
+  *
+  * The dcache is a master of the icache - whenever a dcache entry
+  * exists, the inode will always exist. "iput()" is done either when
+  * the dcache entry is deleted or garbage collected.
+  */
+ 
+ #include <linux/ratelimit.h>
+ #include <linux/string.h>
+ #include <linux/mm.h>
+ #include <linux/fs.h>
+ #include <linux/fscrypt.h>
+ #include <linux/fsnotify.h>
+ #include <linux/slab.h>
+ #include <linux/init.h>
+ #include <linux/hash.h>
+ #include <linux/cache.h>
+ #include <linux/export.h>
+ #include <linux/security.h>
+ #include <linux/seqlock.h>
+ #include <linux/memblock.h>
+ #include <linux/bit_spinlock.h>
+ #include <linux/rculist_bl.h>
+ #include <linux/list_lru.h>
+ #include "internal.h"
+ #include "mount.h"
+ 
+ /*
+  * Usage:
+  * dcache->d_inode->i_lock protects:
+  *   - i_dentry, d_u.d_alias, d_inode of aliases
+  * dcache_hash_bucket lock protects:
+  *   - the dcache hash table
+  * s_roots bl list spinlock protects:
+  *   - the s_roots list (see __d_drop)
+  * dentry->d_sb->s_dentry_lru_lock protects:
+  *   - the dcache lru lists and counters
+  * d_lock protects:
+  *   - d_flags
+  *   - d_name
+  *   - d_lru
+  *   - d_count
+  *   - d_unhashed()
+  *   - d_parent and d_subdirs
+  *   - children's d_child and d_parent
+  *   - d_u.d_alias, d_inode
+  *
+  * Ordering:
+  * dentry->d_inode->i_lock
+  *   dentry->d_lock
+  *     dentry->d_sb->s_dentry_lru_lock
+  *     dcache_hash_bucket lock
+  *     s_roots lock
+  *
+  * If there is an ancestor relationship:
+  * dentry->d_parent->...->d_parent->d_lock
+  *   ...
+  *     dentry->d_parent->d_lock
+  *       dentry->d_lock
+  *
+  * If no ancestor relationship:
+  * arbitrary, since it's serialized on rename_lock
+  */
+ int sysctl_vfs_cache_pressure __read_mostly = 100;
+ EXPORT_SYMBOL_GPL(sysctl_vfs_cache_pressure);
+ 
+ __cacheline_aligned_in_smp DEFINE_SEQLOCK(rename_lock);
+ 
+ EXPORT_SYMBOL(rename_lock);
+ 
+ static struct kmem_cache *dentry_cache __read_mostly;
+ 
+ const struct qstr empty_name = QSTR_INIT("", 0);
+ EXPORT_SYMBOL(empty_name);
+ const struct qstr slash_name = QSTR_INIT("/", 1);
+ EXPORT_SYMBOL(slash_name);
+ 
+ /*
+  * This is the single most critical data structure when it comes
+  * to the dcache: the hashtable for lookups. Somebody should try
+  * to make this good - I've just made it work.
+  *
+  * This hash-function tries to avoid losing too many bits of hash
+  * information, yet avoid using a prime hash-size or similar.
+  */
+ 
+ static unsigned int d_hash_shift __read_mostly;
+ 
+ static struct hlist_bl_head *dentry_hashtable __read_mostly;
+ 
+ static inline struct hlist_bl_head *d_hash(unsigned int hash)
+ {
+ 	return dentry_hashtable + (hash >> d_hash_shift);
+ }
+ 
+ #define IN_LOOKUP_SHIFT 10
+ static struct hlist_bl_head in_lookup_hashtable[1 << IN_LOOKUP_SHIFT];
+ 
+ static inline struct hlist_bl_head *in_lookup_hash(const struct dentry *parent,
+ 					unsigned int hash)
+ {
+ 	hash += (unsigned long) parent / L1_CACHE_BYTES;
+ 	return in_lookup_hashtable + hash_32(hash, IN_LOOKUP_SHIFT);
+ }
+ 
+ 
+ /* Statistics gathering. */
+ struct dentry_stat_t dentry_stat = {
+ 	.age_limit = 45,
+ };
+ 
+ static DEFINE_PER_CPU(long, nr_dentry);
+ static DEFINE_PER_CPU(long, nr_dentry_unused);
+ static DEFINE_PER_CPU(long, nr_dentry_negative);
+ 
+ #if defined(CONFIG_SYSCTL) && defined(CONFIG_PROC_FS)
+ 
+ /*
+  * Here we resort to our own counters instead of using generic per-cpu counters
+  * for consistency with what the vfs inode code does. We are expected to harvest
+  * better code and performance by having our own specialized counters.
+  *
+  * Please note that the loop is done over all possible CPUs, not over all online
+  * CPUs. The reason for this is that we don't want to play games with CPUs going
+  * on and off. If one of them goes off, we will just keep their counters.
+  *
+  * glommer: See cffbc8a for details, and if you ever intend to change this,
+  * please update all vfs counters to match.
+  */
+ static long get_nr_dentry(void)
+ {
+ 	int i;
+ 	long sum = 0;
+ 	for_each_possible_cpu(i)
+ 		sum += per_cpu(nr_dentry, i);
+ 	return sum < 0 ? 0 : sum;
+ }
+ 
+ static long get_nr_dentry_unused(void)
+ {
+ 	int i;
+ 	long sum = 0;
+ 	for_each_possible_cpu(i)
+ 		sum += per_cpu(nr_dentry_unused, i);
+ 	return sum < 0 ? 0 : sum;
+ }
+ 
+ static long get_nr_dentry_negative(void)
+ {
+ 	int i;
+ 	long sum = 0;
+ 
+ 	for_each_possible_cpu(i)
+ 		sum += per_cpu(nr_dentry_negative, i);
+ 	return sum < 0 ? 0 : sum;
+ }
+ 
+ int proc_nr_dentry(struct ctl_table *table, int write, void __user *buffer,
+ 		   size_t *lenp, loff_t *ppos)
+ {
+ 	dentry_stat.nr_dentry = get_nr_dentry();
+ 	dentry_stat.nr_unused = get_nr_dentry_unused();
+ 	dentry_stat.nr_negative = get_nr_dentry_negative();
+ 	return proc_doulongvec_minmax(table, write, buffer, lenp, ppos);
+ }
+ #endif
+ 
+ /*
+  * Compare 2 name strings, return 0 if they match, otherwise non-zero.
+  * The strings are both count bytes long, and count is non-zero.
+  */
+ #ifdef CONFIG_DCACHE_WORD_ACCESS
+ 
+ #include <asm/word-at-a-time.h>
+ /*
+  * NOTE! 'cs' and 'scount' come from a dentry, so it has an
+  * aligned allocation for this particular component. We don't
+  * strictly need the load_unaligned_zeropad() safety, but it
+  * doesn't hurt either.
+  *
+  * In contrast, 'ct' and 'tcount' can be from a pathname, and do
+  * need the careful unaligned handling.
+  */
+ static inline int dentry_string_cmp(const unsigned char *cs, const unsigned char *ct, unsigned tcount)
+ {
+ 	unsigned long a,b,mask;
+ 
+ 	for (;;) {
+ 		a = read_word_at_a_time(cs);
+ 		b = load_unaligned_zeropad(ct);
+ 		if (tcount < sizeof(unsigned long))
+ 			break;
+ 		if (unlikely(a != b))
+ 			return 1;
+ 		cs += sizeof(unsigned long);
+ 		ct += sizeof(unsigned long);
+ 		tcount -= sizeof(unsigned long);
+ 		if (!tcount)
+ 			return 0;
+ 	}
+ 	mask = bytemask_from_count(tcount);
+ 	return unlikely(!!((a ^ b) & mask));
+ }
+ 
+ #else
+ 
+ static inline int dentry_string_cmp(const unsigned char *cs, const unsigned char *ct, unsigned tcount)
+ {
+ 	do {
+ 		if (*cs != *ct)
+ 			return 1;
+ 		cs++;
+ 		ct++;
+ 		tcount--;
+ 	} while (tcount);
+ 	return 0;
+ }
+ 
+ #endif
+ 
+ static inline int dentry_cmp(const struct dentry *dentry, const unsigned char *ct, unsigned tcount)
+ {
+ 	/*
+ 	 * Be careful about RCU walk racing with rename:
+ 	 * use 'READ_ONCE' to fetch the name pointer.
+ 	 *
+ 	 * NOTE! Even if a rename will mean that the length
+ 	 * was not loaded atomically, we don't care. The
+ 	 * RCU walk will check the sequence count eventually,
+ 	 * and catch it. And we won't overrun the buffer,
+ 	 * because we're reading the name pointer atomically,
+ 	 * and a dentry name is guaranteed to be properly
+ 	 * terminated with a NUL byte.
+ 	 *
+ 	 * End result: even if 'len' is wrong, we'll exit
+ 	 * early because the data cannot match (there can
+ 	 * be no NUL in the ct/tcount data)
+ 	 */
+ 	const unsigned char *cs = READ_ONCE(dentry->d_name.name);
+ 
+ 	return dentry_string_cmp(cs, ct, tcount);
+ }
+ 
+ struct external_name {
+ 	union {
+ 		atomic_t count;
+ 		struct rcu_head head;
+ 	} u;
+ 	unsigned char name[];
+ };
+ 
+ static inline struct external_name *external_name(struct dentry *dentry)
+ {
+ 	return container_of(dentry->d_name.name, struct external_name, name[0]);
+ }
+ 
+ static void __d_free(struct rcu_head *head)
+ {
+ 	struct dentry *dentry = container_of(head, struct dentry, d_u.d_rcu);
+ 
+ 	kmem_cache_free(dentry_cache, dentry); 
+ }
+ 
+ static void __d_free_external(struct rcu_head *head)
+ {
+ 	struct dentry *dentry = container_of(head, struct dentry, d_u.d_rcu);
+ 	kfree(external_name(dentry));
+ 	kmem_cache_free(dentry_cache, dentry);
+ }
+ 
+ static inline int dname_external(const struct dentry *dentry)
+ {
+ 	return dentry->d_name.name != dentry->d_iname;
+ }
+ 
+ void take_dentry_name_snapshot(struct name_snapshot *name, struct dentry *dentry)
+ {
+ 	spin_lock(&dentry->d_lock);
+ 	name->name = dentry->d_name;
+ 	if (unlikely(dname_external(dentry))) {
+ 		atomic_inc(&external_name(dentry)->u.count);
+ 	} else {
+ 		memcpy(name->inline_name, dentry->d_iname,
+ 		       dentry->d_name.len + 1);
+ 		name->name.name = name->inline_name;
+ 	}
+ 	spin_unlock(&dentry->d_lock);
+ }
+ EXPORT_SYMBOL(take_dentry_name_snapshot);
+ 
+ void release_dentry_name_snapshot(struct name_snapshot *name)
+ {
+ 	if (unlikely(name->name.name != name->inline_name)) {
+ 		struct external_name *p;
+ 		p = container_of(name->name.name, struct external_name, name[0]);
+ 		if (unlikely(atomic_dec_and_test(&p->u.count)))
+ 			kfree_rcu(p, u.head);
+ 	}
+ }
+ EXPORT_SYMBOL(release_dentry_name_snapshot);
+ 
+ static inline void __d_set_inode_and_type(struct dentry *dentry,
+ 					  struct inode *inode,
+ 					  unsigned type_flags)
+ {
+ 	unsigned flags;
+ 
+ 	dentry->d_inode = inode;
+ 	flags = READ_ONCE(dentry->d_flags);
+ 	flags &= ~(DCACHE_ENTRY_TYPE | DCACHE_FALLTHRU);
+ 	flags |= type_flags;
+ 	WRITE_ONCE(dentry->d_flags, flags);
+ }
+ 
+ static inline void __d_clear_type_and_inode(struct dentry *dentry)
+ {
+ 	unsigned flags = READ_ONCE(dentry->d_flags);
+ 
+ 	flags &= ~(DCACHE_ENTRY_TYPE | DCACHE_FALLTHRU);
+ 	WRITE_ONCE(dentry->d_flags, flags);
+ 	dentry->d_inode = NULL;
+ 	if (dentry->d_flags & DCACHE_LRU_LIST)
+ 		this_cpu_inc(nr_dentry_negative);
+ }
+ 
+ static void dentry_free(struct dentry *dentry)
+ {
+ 	WARN_ON(!hlist_unhashed(&dentry->d_u.d_alias));
+ 	if (unlikely(dname_external(dentry))) {
+ 		struct external_name *p = external_name(dentry);
+ 		if (likely(atomic_dec_and_test(&p->u.count))) {
+ 			call_rcu(&dentry->d_u.d_rcu, __d_free_external);
+ 			return;
+ 		}
+ 	}
+ 	/* if dentry was never visible to RCU, immediate free is OK */
+ 	if (dentry->d_flags & DCACHE_NORCU)
+ 		__d_free(&dentry->d_u.d_rcu);
+ 	else
+ 		call_rcu(&dentry->d_u.d_rcu, __d_free);
+ }
+ 
+ /*
+  * Release the dentry's inode, using the filesystem
+  * d_iput() operation if defined.
+  */
+ static void dentry_unlink_inode(struct dentry * dentry)
+ 	__releases(dentry->d_lock)
+ 	__releases(dentry->d_inode->i_lock)
+ {
+ 	struct inode *inode = dentry->d_inode;
+ 
+ 	raw_write_seqcount_begin(&dentry->d_seq);
+ 	__d_clear_type_and_inode(dentry);
+ 	hlist_del_init(&dentry->d_u.d_alias);
+ 	raw_write_seqcount_end(&dentry->d_seq);
+ 	spin_unlock(&dentry->d_lock);
+ 	spin_unlock(&inode->i_lock);
+ 	if (!inode->i_nlink)
+ 		fsnotify_inoderemove(inode);
+ 	if (dentry->d_op && dentry->d_op->d_iput)
+ 		dentry->d_op->d_iput(dentry, inode);
+ 	else
+ 		iput(inode);
+ }
+ 
+ /*
+  * The DCACHE_LRU_LIST bit is set whenever the 'd_lru' entry
+  * is in use - which includes both the "real" per-superblock
+  * LRU list _and_ the DCACHE_SHRINK_LIST use.
+  *
+  * The DCACHE_SHRINK_LIST bit is set whenever the dentry is
+  * on the shrink list (ie not on the superblock LRU list).
+  *
+  * The per-cpu "nr_dentry_unused" counters are updated with
+  * the DCACHE_LRU_LIST bit.
+  *
+  * The per-cpu "nr_dentry_negative" counters are only updated
+  * when deleted from or added to the per-superblock LRU list, not
+  * from/to the shrink list. That is to avoid an unneeded dec/inc
+  * pair when moving from LRU to shrink list in select_collect().
+  *
+  * These helper functions make sure we always follow the
+  * rules. d_lock must be held by the caller.
+  */
+ #define D_FLAG_VERIFY(dentry,x) WARN_ON_ONCE(((dentry)->d_flags & (DCACHE_LRU_LIST | DCACHE_SHRINK_LIST)) != (x))
+ static void d_lru_add(struct dentry *dentry)
+ {
+ 	D_FLAG_VERIFY(dentry, 0);
+ 	dentry->d_flags |= DCACHE_LRU_LIST;
+ 	this_cpu_inc(nr_dentry_unused);
+ 	if (d_is_negative(dentry))
+ 		this_cpu_inc(nr_dentry_negative);
+ 	WARN_ON_ONCE(!list_lru_add(&dentry->d_sb->s_dentry_lru, &dentry->d_lru));
+ }
+ 
+ static void d_lru_del(struct dentry *dentry)
+ {
+ 	D_FLAG_VERIFY(dentry, DCACHE_LRU_LIST);
+ 	dentry->d_flags &= ~DCACHE_LRU_LIST;
+ 	this_cpu_dec(nr_dentry_unused);
+ 	if (d_is_negative(dentry))
+ 		this_cpu_dec(nr_dentry_negative);
+ 	WARN_ON_ONCE(!list_lru_del(&dentry->d_sb->s_dentry_lru, &dentry->d_lru));
+ }
+ 
+ static void d_shrink_del(struct dentry *dentry)
+ {
+ 	D_FLAG_VERIFY(dentry, DCACHE_SHRINK_LIST | DCACHE_LRU_LIST);
+ 	list_del_init(&dentry->d_lru);
+ 	dentry->d_flags &= ~(DCACHE_SHRINK_LIST | DCACHE_LRU_LIST);
+ 	this_cpu_dec(nr_dentry_unused);
+ }
+ 
+ static void d_shrink_add(struct dentry *dentry, struct list_head *list)
+ {
+ 	D_FLAG_VERIFY(dentry, 0);
+ 	list_add(&dentry->d_lru, list);
+ 	dentry->d_flags |= DCACHE_SHRINK_LIST | DCACHE_LRU_LIST;
+ 	this_cpu_inc(nr_dentry_unused);
+ }
+ 
+ /*
+  * These can only be called under the global LRU lock, ie during the
+  * callback for freeing the LRU list. "isolate" removes it from the
+  * LRU lists entirely, while shrink_move moves it to the indicated
+  * private list.
+  */
+ static void d_lru_isolate(struct list_lru_one *lru, struct dentry *dentry)
+ {
+ 	D_FLAG_VERIFY(dentry, DCACHE_LRU_LIST);
+ 	dentry->d_flags &= ~DCACHE_LRU_LIST;
+ 	this_cpu_dec(nr_dentry_unused);
+ 	if (d_is_negative(dentry))
+ 		this_cpu_dec(nr_dentry_negative);
+ 	list_lru_isolate(lru, &dentry->d_lru);
+ }
+ 
+ static void d_lru_shrink_move(struct list_lru_one *lru, struct dentry *dentry,
+ 			      struct list_head *list)
+ {
+ 	D_FLAG_VERIFY(dentry, DCACHE_LRU_LIST);
+ 	dentry->d_flags |= DCACHE_SHRINK_LIST;
+ 	if (d_is_negative(dentry))
+ 		this_cpu_dec(nr_dentry_negative);
+ 	list_lru_isolate_move(lru, &dentry->d_lru, list);
+ }
+ 
+ /**
+  * d_drop - drop a dentry
+  * @dentry: dentry to drop
+  *
+  * d_drop() unhashes the entry from the parent dentry hashes, so that it won't
+  * be found through a VFS lookup any more. Note that this is different from
+  * deleting the dentry - d_delete will try to mark the dentry negative if
+  * possible, giving a successful _negative_ lookup, while d_drop will
+  * just make the cache lookup fail.
+  *
+  * d_drop() is used mainly for stuff that wants to invalidate a dentry for some
+  * reason (NFS timeouts or autofs deletes).
+  *
+  * __d_drop requires dentry->d_lock
+  * ___d_drop doesn't mark dentry as "unhashed"
+  *   (dentry->d_hash.pprev will be LIST_POISON2, not NULL).
+  */
+ static void ___d_drop(struct dentry *dentry)
+ {
+ 	struct hlist_bl_head *b;
+ 	/*
+ 	 * Hashed dentries are normally on the dentry hashtable,
+ 	 * with the exception of those newly allocated by
+ 	 * d_obtain_root, which are always IS_ROOT:
+ 	 */
+ 	if (unlikely(IS_ROOT(dentry)))
+ 		b = &dentry->d_sb->s_roots;
+ 	else
+ 		b = d_hash(dentry->d_name.hash);
+ 
+ 	hlist_bl_lock(b);
+ 	__hlist_bl_del(&dentry->d_hash);
+ 	hlist_bl_unlock(b);
+ }
+ 
+ void __d_drop(struct dentry *dentry)
+ {
+ 	if (!d_unhashed(dentry)) {
+ 		___d_drop(dentry);
+ 		dentry->d_hash.pprev = NULL;
+ 		write_seqcount_invalidate(&dentry->d_seq);
+ 	}
+ }
+ EXPORT_SYMBOL(__d_drop);
+ 
+ void d_drop(struct dentry *dentry)
+ {
+ 	spin_lock(&dentry->d_lock);
+ 	__d_drop(dentry);
+ 	spin_unlock(&dentry->d_lock);
+ }
+ EXPORT_SYMBOL(d_drop);
+ 
+ static inline void dentry_unlist(struct dentry *dentry, struct dentry *parent)
+ {
+ 	struct dentry *next;
+ 	/*
+ 	 * Inform d_walk() and shrink_dentry_list() that we are no longer
+ 	 * attached to the dentry tree
+ 	 */
+ 	dentry->d_flags |= DCACHE_DENTRY_KILLED;
+ 	if (unlikely(list_empty(&dentry->d_child)))
+ 		return;
+ 	__list_del_entry(&dentry->d_child);
+ 	/*
+ 	 * Cursors can move around the list of children.  While we'd been
+ 	 * a normal list member, it didn't matter - ->d_child.next would've
+ 	 * been updated.  However, from now on it won't be and for the
+ 	 * things like d_walk() it might end up with a nasty surprise.
+ 	 * Normally d_walk() doesn't care about cursors moving around -
+ 	 * ->d_lock on parent prevents that and since a cursor has no children
+ 	 * of its own, we get through it without ever unlocking the parent.
+ 	 * There is one exception, though - if we ascend from a child that
+ 	 * gets killed as soon as we unlock it, the next sibling is found
+ 	 * using the value left in its ->d_child.next.  And if _that_
+ 	 * pointed to a cursor, and cursor got moved (e.g. by lseek())
+ 	 * before d_walk() regains parent->d_lock, we'll end up skipping
+ 	 * everything the cursor had been moved past.
+ 	 *
+ 	 * Solution: make sure that the pointer left behind in ->d_child.next
+ 	 * points to something that won't be moving around.  I.e. skip the
+ 	 * cursors.
+ 	 */
+ 	while (dentry->d_child.next != &parent->d_subdirs) {
+ 		next = list_entry(dentry->d_child.next, struct dentry, d_child);
+ 		if (likely(!(next->d_flags & DCACHE_DENTRY_CURSOR)))
+ 			break;
+ 		dentry->d_child.next = next->d_child.next;
+ 	}
+ }
+ 
+ static void __dentry_kill(struct dentry *dentry)
+ {
+ 	struct dentry *parent = NULL;
+ 	bool can_free = true;
+ 	if (!IS_ROOT(dentry))
+ 		parent = dentry->d_parent;
+ 
+ 	/*
+ 	 * The dentry is now unrecoverably dead to the world.
+ 	 */
+ 	lockref_mark_dead(&dentry->d_lockref);
+ 
+ 	/*
+ 	 * inform the fs via d_prune that this dentry is about to be
+ 	 * unhashed and destroyed.
+ 	 */
+ 	if (dentry->d_flags & DCACHE_OP_PRUNE)
+ 		dentry->d_op->d_prune(dentry);
+ 
+ 	if (dentry->d_flags & DCACHE_LRU_LIST) {
+ 		if (!(dentry->d_flags & DCACHE_SHRINK_LIST))
+ 			d_lru_del(dentry);
+ 	}
+ 	/* if it was on the hash then remove it */
+ 	__d_drop(dentry);
+ 	dentry_unlist(dentry, parent);
+ 	if (parent)
+ 		spin_unlock(&parent->d_lock);
+ 	if (dentry->d_inode)
+ 		dentry_unlink_inode(dentry);
+ 	else
+ 		spin_unlock(&dentry->d_lock);
+ 	this_cpu_dec(nr_dentry);
+ 	if (dentry->d_op && dentry->d_op->d_release)
+ 		dentry->d_op->d_release(dentry);
+ 
+ 	spin_lock(&dentry->d_lock);
+ 	if (dentry->d_flags & DCACHE_SHRINK_LIST) {
+ 		dentry->d_flags |= DCACHE_MAY_FREE;
+ 		can_free = false;
+ 	}
+ 	spin_unlock(&dentry->d_lock);
+ 	if (likely(can_free))
+ 		dentry_free(dentry);
+ 	cond_resched();
+ }
+ 
+ static struct dentry *__lock_parent(struct dentry *dentry)
+ {
+ 	struct dentry *parent;
+ 	rcu_read_lock();
+ 	spin_unlock(&dentry->d_lock);
+ again:
+ 	parent = READ_ONCE(dentry->d_parent);
+ 	spin_lock(&parent->d_lock);
+ 	/*
+ 	 * We can't blindly lock dentry until we are sure
+ 	 * that we won't violate the locking order.
+ 	 * Any changes of dentry->d_parent must have
+ 	 * been done with parent->d_lock held, so
+ 	 * spin_lock() above is enough of a barrier
+ 	 * for checking if it's still our child.
+ 	 */
+ 	if (unlikely(parent != dentry->d_parent)) {
+ 		spin_unlock(&parent->d_lock);
+ 		goto again;
+ 	}
+ 	rcu_read_unlock();
+ 	if (parent != dentry)
+ 		spin_lock_nested(&dentry->d_lock, DENTRY_D_LOCK_NESTED);
+ 	else
+ 		parent = NULL;
+ 	return parent;
+ }
+ 
+ static inline struct dentry *lock_parent(struct dentry *dentry)
+ {
+ 	struct dentry *parent = dentry->d_parent;
+ 	if (IS_ROOT(dentry))
+ 		return NULL;
+ 	if (likely(spin_trylock(&parent->d_lock)))
+ 		return parent;
+ 	return __lock_parent(dentry);
+ }
+ 
+ static inline bool retain_dentry(struct dentry *dentry)
+ {
+ 	WARN_ON(d_in_lookup(dentry));
+ 
+ 	/* Unreachable? Get rid of it */
+ 	if (unlikely(d_unhashed(dentry)))
+ 		return false;
+ 
+ 	if (unlikely(dentry->d_flags & DCACHE_DISCONNECTED))
+ 		return false;
+ 
+ 	if (unlikely(dentry->d_flags & DCACHE_OP_DELETE)) {
+ 		if (dentry->d_op->d_delete(dentry))
+ 			return false;
+ 	}
+ 	/* retain; LRU fodder */
+ 	dentry->d_lockref.count--;
+ 	if (unlikely(!(dentry->d_flags & DCACHE_LRU_LIST)))
+ 		d_lru_add(dentry);
+ 	else if (unlikely(!(dentry->d_flags & DCACHE_REFERENCED)))
+ 		dentry->d_flags |= DCACHE_REFERENCED;
+ 	return true;
+ }
+ 
+ /*
+  * Finish off a dentry we've decided to kill.
+  * dentry->d_lock must be held, returns with it unlocked.
+  * Returns dentry requiring refcount drop, or NULL if we're done.
+  */
+ static struct dentry *dentry_kill(struct dentry *dentry)
+ 	__releases(dentry->d_lock)
+ {
+ 	struct inode *inode = dentry->d_inode;
+ 	struct dentry *parent = NULL;
+ 
+ 	if (inode && unlikely(!spin_trylock(&inode->i_lock)))
+ 		goto slow_positive;
+ 
+ 	if (!IS_ROOT(dentry)) {
+ 		parent = dentry->d_parent;
+ 		if (unlikely(!spin_trylock(&parent->d_lock))) {
+ 			parent = __lock_parent(dentry);
+ 			if (likely(inode || !dentry->d_inode))
+ 				goto got_locks;
+ 			/* negative that became positive */
+ 			if (parent)
+ 				spin_unlock(&parent->d_lock);
+ 			inode = dentry->d_inode;
+ 			goto slow_positive;
+ 		}
+ 	}
+ 	__dentry_kill(dentry);
+ 	return parent;
+ 
+ slow_positive:
+ 	spin_unlock(&dentry->d_lock);
+ 	spin_lock(&inode->i_lock);
+ 	spin_lock(&dentry->d_lock);
+ 	parent = lock_parent(dentry);
+ got_locks:
+ 	if (unlikely(dentry->d_lockref.count != 1)) {
+ 		dentry->d_lockref.count--;
+ 	} else if (likely(!retain_dentry(dentry))) {
+ 		__dentry_kill(dentry);
+ 		return parent;
+ 	}
+ 	/* we are keeping it, after all */
+ 	if (inode)
+ 		spin_unlock(&inode->i_lock);
+ 	if (parent)
+ 		spin_unlock(&parent->d_lock);
+ 	spin_unlock(&dentry->d_lock);
+ 	return NULL;
+ }
+ 
+ /*
+  * Try to do a lockless dput(), and return whether that was successful.
+  *
+  * If unsuccessful, we return false, having already taken the dentry lock.
+  *
+  * The caller needs to hold the RCU read lock, so that the dentry is
+  * guaranteed to stay around even if the refcount goes down to zero!
+  */
+ static inline bool fast_dput(struct dentry *dentry)
+ {
+ 	int ret;
+ 	unsigned int d_flags;
+ 
+ 	/*
+ 	 * If we have a d_op->d_delete() operation, we should not
+ 	 * let the dentry count go to zero, so use "put_or_lock".
+ 	 */
+ 	if (unlikely(dentry->d_flags & DCACHE_OP_DELETE))
+ 		return lockref_put_or_lock(&dentry->d_lockref);
+ 
+ 	/*
+ 	 * .. otherwise, we can try to just decrement the
+ 	 * lockref optimistically.
+ 	 */
+ 	ret = lockref_put_return(&dentry->d_lockref);
+ 
+ 	/*
+ 	 * If the lockref_put_return() failed due to the lock being held
+ 	 * by somebody else, the fast path has failed. We will need to
+ 	 * get the lock, and then check the count again.
+ 	 */
+ 	if (unlikely(ret < 0)) {
+ 		spin_lock(&dentry->d_lock);
+ 		if (dentry->d_lockref.count > 1) {
+ 			dentry->d_lockref.count--;
+ 			spin_unlock(&dentry->d_lock);
+ 			return true;
+ 		}
+ 		return false;
+ 	}
+ 
+ 	/*
+ 	 * If we weren't the last ref, we're done.
+ 	 */
+ 	if (ret)
+ 		return true;
+ 
+ 	/*
+ 	 * Careful, careful. The reference count went down
+ 	 * to zero, but we don't hold the dentry lock, so
+ 	 * somebody else could get it again, and do another
+ 	 * dput(), and we need to not race with that.
+ 	 *
+ 	 * However, there is a very special and common case
+ 	 * where we don't care, because there is nothing to
+ 	 * do: the dentry is still hashed, it does not have
+ 	 * a 'delete' op, and it's referenced and already on
+ 	 * the LRU list.
+ 	 *
+ 	 * NOTE! Since we aren't locked, these values are
+ 	 * not "stable". However, it is sufficient that at
+ 	 * some point after we dropped the reference the
+ 	 * dentry was hashed and the flags had the proper
+ 	 * value. Other dentry users may have re-gotten
+ 	 * a reference to the dentry and change that, but
+ 	 * our work is done - we can leave the dentry
+ 	 * around with a zero refcount.
+ 	 */
+ 	smp_rmb();
+ 	d_flags = READ_ONCE(dentry->d_flags);
+ 	d_flags &= DCACHE_REFERENCED | DCACHE_LRU_LIST | DCACHE_DISCONNECTED;
+ 
+ 	/* Nothing to do? Dropping the reference was all we needed? */
+ 	if (d_flags == (DCACHE_REFERENCED | DCACHE_LRU_LIST) && !d_unhashed(dentry))
+ 		return true;
+ 
+ 	/*
+ 	 * Not the fast normal case? Get the lock. We've already decremented
+ 	 * the refcount, but we'll need to re-check the situation after
+ 	 * getting the lock.
+ 	 */
+ 	spin_lock(&dentry->d_lock);
+ 
+ 	/*
+ 	 * Did somebody else grab a reference to it in the meantime, and
+ 	 * we're no longer the last user after all? Alternatively, somebody
+ 	 * else could have killed it and marked it dead. Either way, we
+ 	 * don't need to do anything else.
+ 	 */
+ 	if (dentry->d_lockref.count) {
+ 		spin_unlock(&dentry->d_lock);
+ 		return true;
+ 	}
+ 
+ 	/*
+ 	 * Re-get the reference we optimistically dropped. We hold the
+ 	 * lock, and we just tested that it was zero, so we can just
+ 	 * set it to 1.
+ 	 */
+ 	dentry->d_lockref.count = 1;
+ 	return false;
+ }
+ 
+ 
+ /* 
+  * This is dput
+  *
+  * This is complicated by the fact that we do not want to put
+  * dentries that are no longer on any hash chain on the unused
+  * list: we'd much rather just get rid of them immediately.
+  *
+  * However, that implies that we have to traverse the dentry
+  * tree upwards to the parents which might _also_ now be
+  * scheduled for deletion (it may have been only waiting for
+  * its last child to go away).
+  *
+  * This tail recursion is done by hand as we don't want to depend
+  * on the compiler to always get this right (gcc generally doesn't).
+  * Real recursion would eat up our stack space.
+  */
+ 
+ /*
+  * dput - release a dentry
+  * @dentry: dentry to release 
+  *
+  * Release a dentry. This will drop the usage count and if appropriate
+  * call the dentry unlink method as well as removing it from the queues and
+  * releasing its resources. If the parent dentries were scheduled for release
+  * they too may now get deleted.
+  */
+ void dput(struct dentry *dentry)
+ {
+ 	while (dentry) {	/* a NULL dentry makes the whole call a no-op */
+ 		might_sleep();
+ 
+ 		rcu_read_lock();	/* fast_dput() must be called under RCU */
+ 		if (likely(fast_dput(dentry))) {
+ 			rcu_read_unlock();
+ 			return;
+ 		}
+ 
+ 		/* Slow case: now with the dentry lock held */
+ 		rcu_read_unlock();
+ 
+ 		if (likely(retain_dentry(dentry))) {
+ 			spin_unlock(&dentry->d_lock);
+ 			return;
+ 		}
+ 
+ 		dentry = dentry_kill(dentry);	/* parent to drop next, or NULL */
+ 	}
+ }
+ EXPORT_SYMBOL(dput);
+ 
+ static void __dput_to_list(struct dentry *dentry, struct list_head *list)
+ __must_hold(&dentry->d_lock)
+ {	/* drop one reference; a dentry that reaches zero is queued on @list */
+ 	if (dentry->d_flags & DCACHE_SHRINK_LIST) {
+ 		/* let the owner of the list it's on deal with it */
+ 		--dentry->d_lockref.count;
+ 	} else {
+ 		if (dentry->d_flags & DCACHE_LRU_LIST)
+ 			d_lru_del(dentry);	/* taken off the LRU even if refs remain */
+ 		if (!--dentry->d_lockref.count)
+ 			d_shrink_add(dentry, list);
+ 	}
+ }
+ 
+ void dput_to_list(struct dentry *dentry, struct list_head *list)
+ {
+ 	rcu_read_lock();	/* fast_dput() must be called under RCU */
+ 	if (likely(fast_dput(dentry))) {
+ 		rcu_read_unlock();
+ 		return;	/* reference dropped on the fast path */
+ 	}
+ 	rcu_read_unlock();	/* fast_dput() failed, so d_lock is held from here */
+ 	if (!retain_dentry(dentry))
+ 		__dput_to_list(dentry, list);
+ 	spin_unlock(&dentry->d_lock);
+ }
+ 
+ /* Take one more reference; the caller must already hold dentry->d_lock. */
+ static inline void __dget_dlock(struct dentry *dentry)
+ {
+ 	dentry->d_lockref.count++;
+ }
+ 
+ static inline void __dget(struct dentry *dentry)
+ {
+ 	lockref_get(&dentry->d_lockref);	/* via lockref, not a bare count++ */
+ }
+ 
+ struct dentry *dget_parent(struct dentry *dentry)
+ {
+ 	int gotref;
+ 	struct dentry *ret;
+ 	unsigned seq;
+ 
+ 	/*
+ 	 * Do optimistic parent lookup without any
+ 	 * locking.
+ 	 */
+ 	rcu_read_lock();
+ 	seq = raw_seqcount_begin(&dentry->d_seq);
+ 	ret = READ_ONCE(dentry->d_parent);
+ 	gotref = lockref_get_not_zero(&ret->d_lockref);
+ 	rcu_read_unlock();
+ 	if (likely(gotref)) {
+ 		if (!read_seqcount_retry(&dentry->d_seq, seq))
+ 			return ret;	/* d_seq unchanged: ret is returned with our ref */
+ 		dput(ret);	/* d_seq moved: undo the ref, use the locked path */
+ 	}
+ 
+ repeat:
+ 	/*
+ 	 * Don't need rcu_dereference because we re-check it was correct under
+ 	 * the lock.
+ 	 */
+ 	rcu_read_lock();
+ 	ret = dentry->d_parent;
+ 	spin_lock(&ret->d_lock);
+ 	if (unlikely(ret != dentry->d_parent)) {
+ 		spin_unlock(&ret->d_lock);
+ 		rcu_read_unlock();
+ 		goto repeat;	/* d_parent changed before we held its lock */
+ 	}
+ 	rcu_read_unlock();
+ 	BUG_ON(!ret->d_lockref.count);
+ 	ret->d_lockref.count++;	/* plain increment: ret->d_lock is held */
+ 	spin_unlock(&ret->d_lock);
+ 	return ret;
+ }
+ EXPORT_SYMBOL(dget_parent);
+ 
+ static struct dentry * __d_find_any_alias(struct inode *inode)
+ {
+ 	struct dentry *first = NULL;	/* stays NULL when the alias list is empty */
+ 
+ 	if (!hlist_empty(&inode->i_dentry)) {
+ 		first = hlist_entry(inode->i_dentry.first, struct dentry, d_u.d_alias);
+ 		__dget(first);	/* returned with a reference taken */
+ 	}
+ 	return first;
+ }
+ 
+ /**
+  * d_find_any_alias - find any alias for a given inode
+  * @inode: inode to find an alias for
+  *
+  * Takes @inode->i_lock around the lookup.  Return: the first dentry on
+  * @inode's alias list with a reference taken, or %NULL if the list is empty.
+  */
+ struct dentry *d_find_any_alias(struct inode *inode)
+ {
+ 	struct dentry *de;
+ 
+ 	spin_lock(&inode->i_lock);
+ 	de = __d_find_any_alias(inode);
+ 	spin_unlock(&inode->i_lock);
+ 	return de;
+ }
+ EXPORT_SYMBOL(d_find_any_alias);
+ 
+ /**
+  * d_find_alias - grab a hashed alias of inode
+  * @inode: inode in question
+  *
+  * If inode has a hashed alias, or is a directory and has any alias,
+  * acquire the reference to alias and return it. Otherwise return NULL.
+  * Notice that if inode is a directory there can be only one alias and
+  * it can be unhashed only if it has no children, or if it is the root
+  * of a filesystem, or if the directory was renamed and d_revalidate
+  * was the first vfs operation to notice.
+  *
+  * NOTE(review): __d_find_alias() returns the first hashed alias it walks
+  * to; it has no special case for IS_ROOT, DCACHE_DISCONNECTED aliases.
+  */
+ static struct dentry *__d_find_alias(struct inode *inode)
+ {
+ 	struct dentry *alias;
+ 
+ 	if (S_ISDIR(inode->i_mode))
+ 		return __d_find_any_alias(inode);	/* directory: hashed or not */
+ 
+ 	hlist_for_each_entry(alias, &inode->i_dentry, d_u.d_alias) {
+ 		spin_lock(&alias->d_lock);	/* d_unhashed() is tested under d_lock */
+  		if (!d_unhashed(alias)) {
+ 			__dget_dlock(alias);
+ 			spin_unlock(&alias->d_lock);
+ 			return alias;
+ 		}
+ 		spin_unlock(&alias->d_lock);
+ 	}
+ 	return NULL;
+ }
+ 
+ struct dentry *d_find_alias(struct inode *inode)
+ {
+ 	struct dentry *alias;
+ 
+ 	if (hlist_empty(&inode->i_dentry))	/* unlocked peek: no alias at all */
+ 		return NULL;
+ 	spin_lock(&inode->i_lock);
+ 	alias = __d_find_alias(inode);	/* NULL, or an alias with a reference */
+ 	spin_unlock(&inode->i_lock);
+ 	return alias;
+ }
+ EXPORT_SYMBOL(d_find_alias);
+ 
+ /*
+  * Try to kill every alias of this inode whose refcount is zero.
+  * WARNING: you must own a reference to inode.
+  */
+ void d_prune_aliases(struct inode *inode)
+ {
+ 	struct dentry *dentry;
+ restart:
+ 	spin_lock(&inode->i_lock);
+ 	hlist_for_each_entry(dentry, &inode->i_dentry, d_u.d_alias) {
+ 		spin_lock(&dentry->d_lock);
+ 		if (!dentry->d_lockref.count) {
+ 			struct dentry *parent = lock_parent(dentry);
+ 			if (likely(!dentry->d_lockref.count)) {	/* re-check after lock_parent() */
+ 				__dentry_kill(dentry);
+ 				dput(parent);
+ 				goto restart;	/* i_lock is retaken there; rescan from the head */
+ 			}
+ 			if (parent)
+ 				spin_unlock(&parent->d_lock);
+ 		}
+ 		spin_unlock(&dentry->d_lock);
+ 	}
+ 	spin_unlock(&inode->i_lock);
+ }
+ EXPORT_SYMBOL(d_prune_aliases);
+ 
+ /*
+  * Lock a dentry from shrink list.
+  * Called under rcu_read_lock() and dentry->d_lock; the former
+  * guarantees that nothing we access will be freed under us.
+  * Note that dentry is *not* protected from concurrent dentry_kill(),
+  * d_delete(), etc.
+  *
+  * Return false if dentry has been disrupted or grabbed, leaving
+  * the caller to kick it off-list.  Otherwise, return true and have
+  * that dentry's inode and parent both locked.
+  */
+ static bool shrink_lock_dentry(struct dentry *dentry)
+ {
+ 	struct inode *inode;
+ 	struct dentry *parent;
+ 
+ 	if (dentry->d_lockref.count)
+ 		return false;	/* nonzero count: no other lock has been taken yet */
+ 
+ 	inode = dentry->d_inode;
+ 	if (inode && unlikely(!spin_trylock(&inode->i_lock))) {
+ 		spin_unlock(&dentry->d_lock);	/* retake as i_lock, then d_lock */
+ 		spin_lock(&inode->i_lock);
+ 		spin_lock(&dentry->d_lock);
+ 		if (unlikely(dentry->d_lockref.count))
+ 			goto out;
+ 		/* changed inode means that somebody had grabbed it */
+ 		if (unlikely(inode != dentry->d_inode))
+ 			goto out;
+ 	}
+ 
+ 	parent = dentry->d_parent;
+ 	if (IS_ROOT(dentry) || likely(spin_trylock(&parent->d_lock)))
+ 		return true;	/* no parent to lock, or locked it without blocking */
+ 
+ 	spin_unlock(&dentry->d_lock);	/* retake as parent's d_lock, then ours */
+ 	spin_lock(&parent->d_lock);
+ 	if (unlikely(parent != dentry->d_parent)) {
+ 		spin_unlock(&parent->d_lock);
+ 		spin_lock(&dentry->d_lock);
+ 		goto out;
+ 	}
+ 	spin_lock_nested(&dentry->d_lock, DENTRY_D_LOCK_NESTED);
+ 	if (likely(!dentry->d_lockref.count))
+ 		return true;
+ 	spin_unlock(&parent->d_lock);
+ out:	/* failure: only i_lock is released here, dentry->d_lock stays held */
+ 	if (inode)
+ 		spin_unlock(&inode->i_lock);
+ 	return false;
+ }
+ 
+ void shrink_dentry_list(struct list_head *list)
+ {
+ 	while (!list_empty(list)) {	/* runs until @list has been emptied */
+ 		struct dentry *dentry, *parent;
+ 
+ 		dentry = list_entry(list->prev, struct dentry, d_lru);	/* from the tail */
+ 		spin_lock(&dentry->d_lock);
+ 		rcu_read_lock();
+ 		if (!shrink_lock_dentry(dentry)) {
+ 			bool can_free = false;
+ 			rcu_read_unlock();
+ 			d_shrink_del(dentry);	/* not killable now: just take it off @list */
+ 			if (dentry->d_lockref.count < 0)	/* NOTE(review): looks like "already killed" */
+ 				can_free = dentry->d_flags & DCACHE_MAY_FREE;
+ 			spin_unlock(&dentry->d_lock);
+ 			if (can_free)
+ 				dentry_free(dentry);
+ 			continue;
+ 		}
+ 		rcu_read_unlock();
+ 		d_shrink_del(dentry);
+ 		parent = dentry->d_parent;
+ 		if (parent != dentry)	/* real parent, locked by shrink_lock_dentry() */
+ 			__dput_to_list(parent, list);
+ 		__dentry_kill(dentry);
+ 	}
+ }
+ 
+ static enum lru_status dentry_lru_isolate(struct list_head *item,
+ 		struct list_lru_one *lru, spinlock_t *lru_lock, void *arg)
+ {
+ 	struct list_head *freeable = arg;
+ 	struct dentry	*dentry = container_of(item, struct dentry, d_lru);
+ 
+ 
+ 	/*
+ 	 * we are inverting the lru lock/dentry->d_lock here,
+ 	 * so use a trylock. If we fail to get the lock, just skip
+ 	 * it
+ 	 */
+ 	if (!spin_trylock(&dentry->d_lock))
+ 		return LRU_SKIP;
+ 
+ 	/*
+ 	 * Referenced dentries are still in use. If they have active
+ 	 * counts, just remove them from the LRU. Otherwise give them
+ 	 * another pass through the LRU.
+ 	 */
+ 	if (dentry->d_lockref.count) {
+ 		d_lru_isolate(lru, dentry);
+ 		spin_unlock(&dentry->d_lock);
+ 		return LRU_REMOVED;
+ 	}
+ 
+ 	if (dentry->d_flags & DCACHE_REFERENCED) {
+ 		dentry->d_flags &= ~DCACHE_REFERENCED;
+ 		spin_unlock(&dentry->d_lock);
+ 
+ 		/*
+ 		 * The list move itself will be made by the common LRU code. At
+ 		 * this point, we've dropped the dentry->d_lock but keep the
+ 		 * lru lock. This is safe to do, since every list movement is
+ 		 * protected by the lru lock even if both locks are held.
+ 		 *
+ 		 * This is guaranteed by the fact that all LRU management
+ 		 * functions are intermediated by the LRU API calls like
+ 		 * list_lru_add and list_lru_del. List movement in this file
+ 		 * only ever occurs through these functions or through callbacks
+ 		 * like this one, that are called from the LRU API.
+ 		 *
+ 		 * The only exceptions to this are functions like
+ 		 * shrink_dentry_list, and code that first checks for the
+ 		 * DCACHE_SHRINK_LIST flag.  Those are guaranteed to be
+ 		 * operating only with stack provided lists after they are
+ 		 * properly isolated from the main list.  It is thus, always a
+ 		 * local access.
+ 		 */
+ 		return LRU_ROTATE;
+ 	}
+ 
+ 	d_lru_shrink_move(lru, dentry, freeable);	/* unused, unreferenced: to @arg */
+ 	spin_unlock(&dentry->d_lock);
+ 
+ 	return LRU_REMOVED;
+ }
+ 
+ /**
+  * prune_dcache_sb - shrink the dcache
+  * @sb: superblock
+  * @sc: shrink control, passed to list_lru_shrink_walk()
+  *
+  * Attempt to shrink the superblock dcache LRU by @sc->nr_to_scan entries. This
+  * is done when we need more memory and called from the superblock shrinker
+  * function.
+  *
+  * This function may fail to free any resources if all the dentries are in
+  * use.  Return: the value returned by list_lru_shrink_walk().
+  */
+ long prune_dcache_sb(struct super_block *sb, struct shrink_control *sc)
+ {
+ 	LIST_HEAD(dispose);
+ 	long freed;
+ 
+ 	freed = list_lru_shrink_walk(&sb->s_dentry_lru, sc,
+ 				     dentry_lru_isolate, &dispose);
+ 	shrink_dentry_list(&dispose);	/* process what the walk moved to dispose */
+ 	return freed;
+ }
+ 
+ static enum lru_status dentry_lru_isolate_shrink(struct list_head *item,
+ 		struct list_lru_one *lru, spinlock_t *lru_lock, void *arg)
+ {
+ 	struct list_head *freeable = arg;
+ 	struct dentry	*dentry = container_of(item, struct dentry, d_lru);
+ 
+ 	/*
+ 	 * we are inverting the lru lock/dentry->d_lock here,
+ 	 * so use a trylock. If we fail to get the lock, just skip
+ 	 * it
+ 	 */
+ 	if (!spin_trylock(&dentry->d_lock))
+ 		return LRU_SKIP;
+ 
+ 	/* unlike dentry_lru_isolate(): no refcount or DCACHE_REFERENCED test */
+ 	d_lru_shrink_move(lru, dentry, freeable);
+ 	spin_unlock(&dentry->d_lock);
+ 
+ 	return LRU_REMOVED;
+ }
+ 
+ 
+ /**
+  * shrink_dcache_sb - shrink dcache for a superblock
+  * @sb: superblock
+  *
+  * Shrink the dcache for the specified super block. This is used to free
+  * the dcache before unmounting a file system.
+  */
+ void shrink_dcache_sb(struct super_block *sb)
+ {
+ 	do {
+ 		LIST_HEAD(dispose);	/* fresh private list for every pass */
+ 
+ 		list_lru_walk(&sb->s_dentry_lru,
+ 			dentry_lru_isolate_shrink, &dispose, 1024);	/* 1024: presumably a per-pass cap */
+ 		shrink_dentry_list(&dispose);
+ 	} while (list_lru_count(&sb->s_dentry_lru) > 0);	/* until the LRU is empty */
+ }
+ EXPORT_SYMBOL(shrink_dcache_sb);
+ 
+ /**
+  * enum d_walk_ret - action to take during tree walk
+  * @D_WALK_CONTINUE:	continue walk
+  * @D_WALK_QUIT:	quit walk
+  * @D_WALK_NORETRY:	quit when retry is needed
+  * @D_WALK_SKIP:	skip this dentry and its children
+  */
+ enum d_walk_ret {
+ 	D_WALK_CONTINUE,
+ 	D_WALK_QUIT,
+ 	D_WALK_NORETRY,
+ 	D_WALK_SKIP,
+ };
+ 
+ /**
+  * d_walk - walk the dentry tree
+  * @parent:	start of walk
+  * @data:	data passed to @enter()
+  * @enter:	callback when first entering the dentry
+  *
+  * The @enter() callbacks are called with d_lock held.
+  */
+ static void d_walk(struct dentry *parent, void *data,
+ 		   enum d_walk_ret (*enter)(void *, struct dentry *))
+ {
+ 	struct dentry *this_parent;
+ 	struct list_head *next;
+ 	unsigned seq = 0;
+ 	enum d_walk_ret ret;
+ 	bool retry = true;	/* cleared once a callback returns D_WALK_NORETRY */
+ 
+ again:
+ 	read_seqbegin_or_lock(&rename_lock, &seq);
+ 	this_parent = parent;
+ 	spin_lock(&this_parent->d_lock);
+ 
+ 	ret = enter(data, this_parent);	/* the starting dentry is visited too */
+ 	switch (ret) {
+ 	case D_WALK_CONTINUE:
+ 		break;
+ 	case D_WALK_QUIT:
+ 	case D_WALK_SKIP:
+ 		goto out_unlock;	/* at the start, SKIP ends the walk just like QUIT */
+ 	case D_WALK_NORETRY:
+ 		retry = false;
+ 		break;
+ 	}
+ repeat:
+ 	next = this_parent->d_subdirs.next;
+ resume:
+ 	while (next != &this_parent->d_subdirs) {
+ 		struct list_head *tmp = next;
+ 		struct dentry *dentry = list_entry(tmp, struct dentry, d_child);
+ 		next = tmp->next;
+ 
+ 		if (unlikely(dentry->d_flags & DCACHE_DENTRY_CURSOR))
+ 			continue;	/* cursors are never passed to @enter() */
+ 
+ 		spin_lock_nested(&dentry->d_lock, DENTRY_D_LOCK_NESTED);
+ 
+ 		ret = enter(data, dentry);
+ 		switch (ret) {
+ 		case D_WALK_CONTINUE:
+ 			break;
+ 		case D_WALK_QUIT:
+ 			spin_unlock(&dentry->d_lock);
+ 			goto out_unlock;
+ 		case D_WALK_NORETRY:
+ 			retry = false;
+ 			break;
+ 		case D_WALK_SKIP:
+ 			spin_unlock(&dentry->d_lock);
+ 			continue;	/* do not descend into this child */
+ 		}
+ 
+ 		if (!list_empty(&dentry->d_subdirs)) {
+ 			spin_unlock(&this_parent->d_lock);
+ 			spin_release(&dentry->d_lock.dep_map, 1, _RET_IP_);
+ 			this_parent = dentry;	/* descend; the child's d_lock is not dropped */
+ 			spin_acquire(&this_parent->d_lock.dep_map, 0, 1, _RET_IP_);
+ 			goto repeat;
+ 		}
+ 		spin_unlock(&dentry->d_lock);
+ 	}
+ 	/*
+ 	 * All done at this level ... ascend and resume the search.
+ 	 */
+ 	rcu_read_lock();
+ ascend:
+ 	if (this_parent != parent) {
+ 		struct dentry *child = this_parent;
+ 		this_parent = child->d_parent;
+ 
+ 		spin_unlock(&child->d_lock);
+ 		spin_lock(&this_parent->d_lock);
+ 
+ 		/* might go back up the wrong parent if we have had a rename. */
+ 		if (need_seqretry(&rename_lock, seq))
+ 			goto rename_retry;
+ 		/* go into the first sibling still alive */
+ 		do {
+ 			next = child->d_child.next;
+ 			if (next == &this_parent->d_subdirs)
+ 				goto ascend;	/* no siblings left: go up one more level */
+ 			child = list_entry(next, struct dentry, d_child);
+ 		} while (unlikely(child->d_flags & DCACHE_DENTRY_KILLED));
+ 		rcu_read_unlock();
+ 		goto resume;
+ 	}
+ 	if (need_seqretry(&rename_lock, seq))
+ 		goto rename_retry;
+ 	rcu_read_unlock();
+ 
+ out_unlock:
+ 	spin_unlock(&this_parent->d_lock);
+ 	done_seqretry(&rename_lock, seq);
+ 	return;
+ 
+ rename_retry:
+ 	spin_unlock(&this_parent->d_lock);
+ 	rcu_read_unlock();
+ 	BUG_ON(seq & 1);
+ 	if (!retry)
+ 		return;	/* a callback returned D_WALK_NORETRY: give up instead */
+ 	seq = 1;	/* presumably makes read_seqbegin_or_lock() take the lock; confirm */
+ 	goto again;
+ }
+ 
+ struct check_mount {
+ 	struct vfsmount *mnt;	/* mount paired with each walked dentry */
+ 	unsigned int mounted;	/* set to 1 by path_check_mount() on a hit */
+ };
+ 
+ static enum d_walk_ret path_check_mount(void *data, struct dentry *dentry)
+ {
+ 	struct check_mount *check = data;
+ 	struct path here = { .mnt = check->mnt, .dentry = dentry };
+ 
+ 	/* d_walk() callback: quit at the first dentry __path_is_mountpoint()
+ 	 * accepts for check->mnt; d_mountpoint() is the cheap filter before it. */
+ 	if (unlikely(d_mountpoint(dentry)) && __path_is_mountpoint(&here)) {
+ 		check->mounted = 1;
+ 		return D_WALK_QUIT;
+ 	}
+ 	return D_WALK_CONTINUE;
+ }
+ 
+ /**
+  * path_has_submounts - check for mounts over a dentry in the
+  *                      current namespace.
+  * @parent: path to check.
+  *
+  * Return: 1 if the parent or its subdirectories contain
+  * a mount point in the current namespace, 0 otherwise.
+  */
+ int path_has_submounts(const struct path *parent)
+ {
+ 	struct check_mount data = { .mnt = parent->mnt, .mounted = 0 };
+ 
+ 	read_seqlock_excl(&mount_lock);	/* held across the whole walk */
+ 	d_walk(parent->dentry, &data, path_check_mount);
+ 	read_sequnlock_excl(&mount_lock);
+ 
+ 	return data.mounted;
+ }
+ EXPORT_SYMBOL(path_has_submounts);
+ 
+ /*
+  * Called by mount code to set a mountpoint and check if the mountpoint is
+  * reachable (e.g. NFS can unhash a directory dentry and then the complete
+  * subtree can become unreachable).
+  *
+  * Only one of d_invalidate() and d_set_mounted() must succeed.  For
+  * this reason take rename_lock and d_lock on dentry and ancestors.
+  */
+ int d_set_mounted(struct dentry *dentry)
+ {
+ 	struct dentry *p;
+ 	int ret = -ENOENT;	/* result for an unhashed ancestor or d_unlinked(dentry) */
+ 	write_seqlock(&rename_lock);
+ 	for (p = dentry->d_parent; !IS_ROOT(p); p = p->d_parent) {
+ 		/* Need exclusion wrt. d_invalidate() */
+ 		spin_lock(&p->d_lock);
+ 		if (unlikely(d_unhashed(p))) {
+ 			spin_unlock(&p->d_lock);
+ 			goto out;
+ 		}
+ 		spin_unlock(&p->d_lock);
+ 	}
+ 	spin_lock(&dentry->d_lock);
+ 	if (!d_unlinked(dentry)) {
+ 		ret = -EBUSY;	/* kept if DCACHE_MOUNTED was already set */
+ 		if (!d_mountpoint(dentry)) {
+ 			dentry->d_flags |= DCACHE_MOUNTED;
+ 			ret = 0;
+ 		}
+ 	}
+  	spin_unlock(&dentry->d_lock);
+ out:
+ 	write_sequnlock(&rename_lock);
+ 	return ret;
+ }
+ 
+ /*
+  * Search the dentry child list of the specified parent,
+  * and move any unused dentries to the end of the unused
+  * list for prune_dcache(). We descend to the next level
+  * whenever the d_subdirs list is non-empty and continue
+  * searching.
+  *
+  * It returns zero iff there are no unused children,
+  * otherwise  it returns the number of children moved to
+  * the end of the unused list. This may not be the total
+  * number of unused children, because select_parent can
+  * drop the lock and return early due to latency
+  * constraints.
+  */
+ 
+ struct select_data {
+ 	struct dentry *start;	/* root of the walk; the callbacks never collect it */
+ 	union {
+ 		long found;	/* counter incremented by select_collect() */
+ 		struct dentry *victim;	/* set by select_collect2() */
+ 	};
+ 	struct list_head dispose;	/* dentries queued with d_shrink_add() */
+ };
+ 
+ static enum d_walk_ret select_collect(void *_data, struct dentry *dentry)
+ {
+ 	struct select_data *data = _data;
+ 	enum d_walk_ret ret = D_WALK_CONTINUE;
+ 
+ 	if (data->start == dentry)
+ 		goto out;	/* the walk's starting dentry is never collected */
+ 
+ 	if (dentry->d_flags & DCACHE_SHRINK_LIST) {
+ 		data->found++;	/* already on a shrink list: count it, leave it there */
+ 	} else {
+ 		if (dentry->d_flags & DCACHE_LRU_LIST)
+ 			d_lru_del(dentry);	/* taken off the LRU even if still referenced */
+ 		if (!dentry->d_lockref.count) {
+ 			d_shrink_add(dentry, &data->dispose);
+ 			data->found++;
+ 		}
+ 	}
+ 	/*
+ 	 * We can return to the caller if we have found some (this
+ 	 * ensures forward progress). We'll be coming back to find
+ 	 * the rest.
+ 	 */
+ 	if (!list_empty(&data->dispose))
+ 		ret = need_resched() ? D_WALK_QUIT : D_WALK_NORETRY;
+ out:
+ 	return ret;
+ }
+ 
+ static enum d_walk_ret select_collect2(void *_data, struct dentry *dentry)
+ {
+ 	struct select_data *data = _data;
+ 	enum d_walk_ret ret = D_WALK_CONTINUE;
+ 
+ 	if (data->start == dentry)
+ 		goto out;	/* the walk's starting dentry is never collected */
+ 
+ 	if (dentry->d_flags & DCACHE_SHRINK_LIST) {
+ 		if (!dentry->d_lockref.count) {
+ 			rcu_read_lock();	/* left held; shrink_dcache_parent() unlocks it */
+ 			data->victim = dentry;
+ 			return D_WALK_QUIT;
+ 		}
+ 	} else {
+ 		if (dentry->d_flags & DCACHE_LRU_LIST)
+ 			d_lru_del(dentry);
+ 		if (!dentry->d_lockref.count)
+ 			d_shrink_add(dentry, &data->dispose);
+ 	}
+ 	/*
+ 	 * We can return to the caller if we have found some (this
+ 	 * ensures forward progress). We'll be coming back to find
+ 	 * the rest.
+ 	 */
+ 	if (!list_empty(&data->dispose))
+ 		ret = need_resched() ? D_WALK_QUIT : D_WALK_NORETRY;
+ out:
+ 	return ret;
+ }
+ 
+ /**
+  * shrink_dcache_parent - prune dcache
+  * @parent: parent of entries to prune
+  *
+  * Prune the dcache to remove unused children of the parent dentry.
+  */
+ void shrink_dcache_parent(struct dentry *parent)
+ {
+ 	for (;;) {
+ 		struct select_data data = {.start = parent};
+ 
+ 		INIT_LIST_HEAD(&data.dispose);
+ 		d_walk(parent, &data, select_collect);
+ 
+ 		if (!list_empty(&data.dispose)) {
+ 			shrink_dentry_list(&data.dispose);
+ 			continue;	/* made progress: walk again from the top */
+ 		}
+ 
+ 		cond_resched();
+ 		if (!data.found)
+ 			break;	/* select_collect() counted nothing: finished */
+ 		data.victim = NULL;	/* shares a union with .found: reset before reuse */
+ 		d_walk(parent, &data, select_collect2);
+ 		if (data.victim) {
+ 			struct dentry *parent;	/* NOTE: shadows the function argument */
+ 			spin_lock(&data.victim->d_lock);
+ 			if (!shrink_lock_dentry(data.victim)) {
+ 				spin_unlock(&data.victim->d_lock);
+ 				rcu_read_unlock();	/* pairs with select_collect2() */
+ 			} else {
+ 				rcu_read_unlock();	/* pairs with select_collect2() */
+ 				parent = data.victim->d_parent;
+ 				if (parent != data.victim)
+ 					__dput_to_list(parent, &data.dispose);
+ 				__dentry_kill(data.victim);
+ 			}
+ 		}
+ 		if (!list_empty(&data.dispose))
+ 			shrink_dentry_list(&data.dispose);
+ 	}
+ }
+ EXPORT_SYMBOL(shrink_dcache_parent);
+ 
+ static enum d_walk_ret umount_check(void *_data, struct dentry *dentry)
+ {
+ 	/* it has busy descendents; complain about those instead */
+ 	if (!list_empty(&dentry->d_subdirs))
+ 		return D_WALK_CONTINUE;
+ 
+ 	/* root with refcount 1 is fine (@_data is the root of the walk) */
+ 	if (dentry == _data && dentry->d_lockref.count == 1)
+ 		return D_WALK_CONTINUE;
+ 
+ 	printk(KERN_ERR "BUG: Dentry %p{i=%lx,n=%pd} "
+ 			" still in use (%d) [unmount of %s %s]\n",
+ 		       dentry,
+ 		       dentry->d_inode ?
+ 		       dentry->d_inode->i_ino : 0UL,
+ 		       dentry,
+ 		       dentry->d_lockref.count,
+ 		       dentry->d_sb->s_type->name,
+ 		       dentry->d_sb->s_id);
+ 	WARN_ON(1);
+ 	return D_WALK_CONTINUE;	/* keep walking so every leftover is reported */
+ }
+ 
+ static void do_one_tree(struct dentry *dentry)
+ {
+ 	shrink_dcache_parent(dentry);	/* prune the unused children first */
+ 	d_walk(dentry, dentry, umount_check);	/* then report anything still in use */
+ 	d_drop(dentry);
+ 	dput(dentry);	/* drops one reference on the tree's root */
+ }
+ 
+ /*
+  * Destroy the dentries under sb->s_root and every entry of sb->s_roots.
+  */
+ void shrink_dcache_for_umount(struct super_block *sb)
+ {
+ 	struct dentry *dentry;
+ 
+ 	WARN(down_read_trylock(&sb->s_umount), "s_umount should've been locked");
+ 
+ 	dentry = sb->s_root;
+ 	sb->s_root = NULL;	/* cleared before the tree is torn down */
+ 	do_one_tree(dentry);
+ 
+ 	while (!hlist_bl_empty(&sb->s_roots)) {
+ 		dentry = dget(hlist_bl_entry(hlist_bl_first(&sb->s_roots), struct dentry, d_hash));
+ 		do_one_tree(dentry);	/* its dput() balances the dget() just above */
+ 	}
+ }
+ 
+ static enum d_walk_ret find_submount(void *_data, struct dentry *dentry)
+ {
+ 	struct dentry **found = _data;
+ 
+ 	if (!d_mountpoint(dentry))
+ 		return D_WALK_CONTINUE;
+ 	__dget_dlock(dentry);	/* d_walk() calls us with d_lock held */
+ 	*found = dentry;	/* handed back with the reference just taken */
+ 	return D_WALK_QUIT;
+ }
+ 
+ /**
+  * d_invalidate - detach submounts, prune dcache, and drop
+  * @dentry: dentry to invalidate (aka detach, prune and drop)
+  */
+ void d_invalidate(struct dentry *dentry)
+ {
+ 	bool had_submounts = false;
+ 	spin_lock(&dentry->d_lock);
+ 	if (d_unhashed(dentry)) {
+ 		spin_unlock(&dentry->d_lock);
+ 		return;	/* already unhashed: nothing to do */
+ 	}
+ 	__d_drop(dentry);
+ 	spin_unlock(&dentry->d_lock);
+ 
+ 	/* Negative dentries can be dropped without further checks */
+ 	if (!dentry->d_inode)
+ 		return;
+ 
+ 	shrink_dcache_parent(dentry);
+ 	for (;;) {	/* detach one submount per iteration until none is found */
+ 		struct dentry *victim = NULL;
+ 		d_walk(dentry, &victim, find_submount);
+ 		if (!victim) {
+ 			if (had_submounts)
+ 				shrink_dcache_parent(dentry);	/* prune again after detaching */
+ 			return;
+ 		}
+ 		had_submounts = true;
+ 		detach_mounts(victim);
+ 		dput(victim);	/* drops the reference find_submount() took */
+ 	}
+ }
+ EXPORT_SYMBOL(d_invalidate);
+ 
+ /**
+  * __d_alloc	-	allocate a dcache entry
+  * @sb: filesystem it will belong to
+  * @name: qstr of the name, or NULL to get the name from slash_name
+  *
+  * Allocates a dentry. It returns %NULL if memory is short or if the
+  * filesystem's ->d_init() fails. On success the dentry is returned with
+  * a refcount of 1. The name is copied, so the caller may reuse @name.
+  */
+  
+ struct dentry *__d_alloc(struct super_block *sb, const struct qstr *name)
+ {
+ 	struct dentry *dentry;
+ 	char *dname;
+ 	int err;
+ 
+ 	dentry = kmem_cache_alloc(dentry_cache, GFP_KERNEL);
+ 	if (!dentry)
+ 		return NULL;
+ 
+ 	/*
+ 	 * We guarantee that the inline name is always NUL-terminated.
+ 	 * This way the memcpy() done by the name switching in rename
+ 	 * will still always have a NUL at the end, even if we might
+ 	 * be overwriting an internal NUL character
+ 	 */
+ 	dentry->d_iname[DNAME_INLINE_LEN-1] = 0;
+ 	if (unlikely(!name)) {
+ 		name = &slash_name;	/* no name given: fall back to slash_name */
+ 		dname = dentry->d_iname;
+ 	} else if (name->len > DNAME_INLINE_LEN-1) {	/* too long for d_iname */
+ 		size_t size = offsetof(struct external_name, name[1]);	/* header + NUL */
+ 		struct external_name *p = kmalloc(size + name->len,
+ 						  GFP_KERNEL_ACCOUNT |
+ 						  __GFP_RECLAIMABLE);
+ 		if (!p) {
+ 			kmem_cache_free(dentry_cache, dentry); 
+ 			return NULL;
+ 		}
+ 		atomic_set(&p->u.count, 1);
+ 		dname = p->name;
+ 	} else  {
+ 		dname = dentry->d_iname;
+ 	}	
+ 
+ 	dentry->d_name.len = name->len;
+ 	dentry->d_name.hash = name->hash;
+ 	memcpy(dname, name->name, name->len);
+ 	dname[name->len] = 0;
+ 
+ 	/* Make sure we always see the terminating NUL character */
+ 	smp_store_release(&dentry->d_name.name, dname); /* ^^^ */
+ 
+ 	dentry->d_lockref.count = 1;	/* the reference handed to the caller */
+ 	dentry->d_flags = 0;
+ 	spin_lock_init(&dentry->d_lock);
+ 	seqcount_init(&dentry->d_seq);
+ 	dentry->d_inode = NULL;
+ 	dentry->d_parent = dentry;	/* its own parent until d_alloc() re-parents it */
+ 	dentry->d_sb = sb;
+ 	dentry->d_op = NULL;
+ 	dentry->d_fsdata = NULL;
+ #ifdef CONFIG_MINISEC_CHROOT_RENAME
+ 	atomic_set(&dentry->chroot_refcnt, 0);	/* MINISEC counter starts at zero */
+ #endif
+ 	INIT_HLIST_BL_NODE(&dentry->d_hash);
+ 	INIT_LIST_HEAD(&dentry->d_lru);
+ 	INIT_LIST_HEAD(&dentry->d_subdirs);
+ 	INIT_HLIST_NODE(&dentry->d_u.d_alias);
+ 	INIT_LIST_HEAD(&dentry->d_child);
+ 	d_set_d_op(dentry, dentry->d_sb->s_d_op);
+ 
+ 	if (dentry->d_op && dentry->d_op->d_init) {
+ 		err = dentry->d_op->d_init(dentry);
+ 		if (err) {	/* the error code itself is not passed on to the caller */
+ 			if (dname_external(dentry))
+ 				kfree(external_name(dentry));
+ 			kmem_cache_free(dentry_cache, dentry);
+ 			return NULL;
+ 		}
+ 	}
+ 
+ 	this_cpu_inc(nr_dentry);	/* counted only after every failure path is past */
+ 
+ 	return dentry;
+ }
+ 
+ /**
+  * d_alloc	-	allocate a dcache entry
+  * @parent: parent of entry to allocate
+  * @name: qstr of the name
+  *
+  * Allocates a dentry via __d_alloc() and links it under @parent, taking
+  * a reference on @parent. It returns %NULL if __d_alloc() fails. The name
+  * passed in is copied, so the caller may reuse it after this call.
+  */
+ struct dentry *d_alloc(struct dentry * parent, const struct qstr *name)
+ {
+ 	struct dentry *dentry = __d_alloc(parent->d_sb, name);
+ 	if (!dentry)
+ 		return NULL;
+ 	spin_lock(&parent->d_lock);
+ 	/*
+ 	 * don't need child lock because it is not subject
+ 	 * to concurrency here
+ 	 */
+ 	__dget_dlock(parent);	/* the new child holds a reference on its parent */
+ 	dentry->d_parent = parent;
+ 	list_add(&dentry->d_child, &parent->d_subdirs);
+ 	spin_unlock(&parent->d_lock);
+ 
+ 	return dentry;
+ }
+ EXPORT_SYMBOL(d_alloc);
+ 
+ struct dentry *d_alloc_anon(struct super_block *sb)
+ {
+ 	return __d_alloc(sb, NULL);	/* NULL name: __d_alloc() uses slash_name */
+ }
+ EXPORT_SYMBOL(d_alloc_anon);
+ 
+ struct dentry *d_alloc_cursor(struct dentry * parent)
+ {
+ 	struct dentry *cursor = d_alloc_anon(parent->d_sb);
+ 	if (!cursor)
+ 		return NULL;
+ 	cursor->d_flags |= DCACHE_DENTRY_CURSOR;	/* d_walk() skips these */
+ 	cursor->d_parent = dget(parent);	/* pinned, but not added to d_subdirs */
+ 	return cursor;
+ }
+ 
+ /**
+  * d_alloc_pseudo - allocate a dentry (for lookup-less filesystems)
+  * @sb: the superblock
+  * @name: qstr of the name
+  *
+  * For a filesystem that just pins its dentries in memory and never
+  * performs lookups at all, return an unhashed IS_ROOT dentry.
+  * This is used for pipes, sockets et.al. - the stuff that should
+  * never be anyone's children or parents.  Unlike all other
+  * dentries, these will not have RCU delay between dropping the
+  * last reference and freeing them.
+  *
+  * The only user is alloc_file_pseudo() and that's what should
+  * be considered a public interface.  Don't use directly.
+  */
+ struct dentry *d_alloc_pseudo(struct super_block *sb, const struct qstr *name)
+ {
+ 	struct dentry *dentry = __d_alloc(sb, name);
+ 	if (likely(dentry))
+ 		dentry->d_flags |= DCACHE_NORCU;	/* the "no RCU delay" marker */
+ 	return dentry;	/* NULL when __d_alloc() failed */
+ }
+ 
+ struct dentry *d_alloc_name(struct dentry *parent, const char *name)
+ {
+ 	struct qstr q;	/* temporary qstr wrapping the C string @name */
+ 
+ 	q.name = name;
+ 	q.hash_len = hashlen_string(parent, name);	/* presumably sets q.hash and q.len */
+ 	return d_alloc(parent, &q);
+ }
+ EXPORT_SYMBOL(d_alloc_name);
+ 
+ void d_set_d_op(struct dentry *dentry, const struct dentry_operations *op)
+ {
+ 	WARN_ON_ONCE(dentry->d_op);	/* ops are expected to be set only once */
+ 	WARN_ON_ONCE(dentry->d_flags & (DCACHE_OP_HASH	|
+ 				DCACHE_OP_COMPARE	|
+ 				DCACHE_OP_REVALIDATE	|
+ 				DCACHE_OP_WEAK_REVALIDATE	|
+ 				DCACHE_OP_DELETE	|
+ 				DCACHE_OP_REAL));	/* NOTE(review): DCACHE_OP_PRUNE is not checked */
+ 	dentry->d_op = op;
+ 	if (!op)
+ 		return;
+ 	if (op->d_hash)	/* below: one DCACHE_OP_* flag per method @op provides */
+ 		dentry->d_flags |= DCACHE_OP_HASH;
+ 	if (op->d_compare)
+ 		dentry->d_flags |= DCACHE_OP_COMPARE;
+ 	if (op->d_revalidate)
+ 		dentry->d_flags |= DCACHE_OP_REVALIDATE;
+ 	if (op->d_weak_revalidate)
+ 		dentry->d_flags |= DCACHE_OP_WEAK_REVALIDATE;
+ 	if (op->d_delete)
+ 		dentry->d_flags |= DCACHE_OP_DELETE;
+ 	if (op->d_prune)
+ 		dentry->d_flags |= DCACHE_OP_PRUNE;
+ 	if (op->d_real)
+ 		dentry->d_flags |= DCACHE_OP_REAL;
+ 
+ }
+ EXPORT_SYMBOL(d_set_d_op);
+ 
+ 
+ /*
+  * d_set_fallthru - Mark a dentry as falling through to a lower layer
+  * @dentry - The dentry to mark
+  *
+  * Mark a dentry as falling through to the lower layer (as set with
+  * d_pin_lower()).  This flag may be recorded on the medium.
+  */
+ void d_set_fallthru(struct dentry *dentry)
+ {
+ 	spin_lock(&dentry->d_lock);	/* the flag update is done under d_lock */
+ 	dentry->d_flags |= DCACHE_FALLTHRU;
+ 	spin_unlock(&dentry->d_lock);
+ }
+ EXPORT_SYMBOL(d_set_fallthru);
+ 
+ static unsigned d_flags_for_inode(struct inode *inode)
+ {
+ 	unsigned add_flags = DCACHE_REGULAR_TYPE;	/* default if no test below matches */
+ 
+ 	if (!inode)
+ 		return DCACHE_MISS_TYPE;	/* no inode at all */
+ 
+ 	if (S_ISDIR(inode->i_mode)) {
+ 		add_flags = DCACHE_DIRECTORY_TYPE;
+ 		if (unlikely(!(inode->i_opflags & IOP_LOOKUP))) {
+ 			if (unlikely(!inode->i_op->lookup))
+ 				add_flags = DCACHE_AUTODIR_TYPE;	/* directory without ->lookup */
+ 			else
+ 				inode->i_opflags |= IOP_LOOKUP;	/* remember ->lookup exists */
+ 		}
+ 		goto type_determined;
+ 	}
+ 
+ 	if (unlikely(!(inode->i_opflags & IOP_NOFOLLOW))) {
+ 		if (unlikely(inode->i_op->get_link)) {
+ 			add_flags = DCACHE_SYMLINK_TYPE;
+ 			goto type_determined;
+ 		}
+ 		inode->i_opflags |= IOP_NOFOLLOW;	/* remember there is no ->get_link */
+ 	}
+ 
+ 	if (unlikely(!S_ISREG(inode->i_mode)))
+ 		add_flags = DCACHE_SPECIAL_TYPE;
+ 
+ type_determined:
+ 	if (unlikely(IS_AUTOMOUNT(inode)))
+ 		add_flags |= DCACHE_NEED_AUTOMOUNT;
+ 	return add_flags;
+ }
+ 
+ static void __d_instantiate(struct dentry *dentry, struct inode *inode)
+ {
+ 	unsigned add_flags = d_flags_for_inode(inode);	/* computed before taking d_lock */
+ 	WARN_ON(d_in_lookup(dentry));	/* an in-lookup dentry is not expected here */
+ 
+ 	spin_lock(&dentry->d_lock);
+ 	/*
+ 	 * Decrement negative dentry count if it was in the LRU list.
+ 	 */
+ 	if (dentry->d_flags & DCACHE_LRU_LIST)
+ 		this_cpu_dec(nr_dentry_negative);
+ 	hlist_add_head(&dentry->d_u.d_alias, &inode->i_dentry);	/* join the inode's alias list */
+ 	raw_write_seqcount_begin(&dentry->d_seq);	/* d_seq write section around the update */
+ 	__d_set_inode_and_type(dentry, inode, add_flags);
+ 	raw_write_seqcount_end(&dentry->d_seq);
+ 	fsnotify_update_flags(dentry);
+ 	spin_unlock(&dentry->d_lock);
+ }
+ 
+ /**
+  * d_instantiate - fill in inode information for a dentry
+  * @entry: dentry to complete
+  * @inode: inode to attach to this dentry
+  *
+  * Fill in inode information in the entry.
+  *
+  * This turns negative dentries into productive full members
+  * of society.
+  *
+  * NOTE! This assumes that the inode count has been incremented
+  * (or otherwise set) by the caller to indicate that it is now
+  * in use by the dcache.
+  */
+  
+ void d_instantiate(struct dentry *entry, struct inode * inode)
+ {
+ 	BUG_ON(!hlist_unhashed(&entry->d_u.d_alias));	/* must not be on an alias list yet */
+ 	if (inode) {	/* with a NULL inode only the BUG_ON check above runs */
+ 		security_d_instantiate(entry, inode);
+ 		spin_lock(&inode->i_lock);	/* __d_instantiate() runs under i_lock */
+ 		__d_instantiate(entry, inode);
+ 		spin_unlock(&inode->i_lock);
+ 	}
+ }
+ EXPORT_SYMBOL(d_instantiate);
+ 
+ /*
+  * This should be equivalent to d_instantiate() + unlock_new_inode(),
+  * with lockdep-related part of unlock_new_inode() done before
+  * anything else.  Use that instead of open-coding d_instantiate()/
+  * unlock_new_inode() combinations.
+  */
+ void d_instantiate_new(struct dentry *entry, struct inode *inode)
+ {
+ 	BUG_ON(!hlist_unhashed(&entry->d_u.d_alias));	/* must not be on an alias list yet */
+ 	BUG_ON(!inode);	/* unlike d_instantiate(), a NULL inode is fatal here */
+ 	lockdep_annotate_inode_mutex_key(inode);
+ 	security_d_instantiate(entry, inode);
+ 	spin_lock(&inode->i_lock);
+ 	__d_instantiate(entry, inode);
+ 	WARN_ON(!(inode->i_state & I_NEW));	/* inode is expected to still be I_NEW */
+ 	inode->i_state &= ~I_NEW & ~I_CREATING;	/* clear both bits in one store */
+ 	smp_mb();	/* barrier between the i_state update and the wakeup */
+ 	wake_up_bit(&inode->i_state, __I_NEW);
+ 	spin_unlock(&inode->i_lock);
+ }
+ EXPORT_SYMBOL(d_instantiate_new);
+ 
+ struct dentry *d_make_root(struct inode *root_inode)
+ {
+ 	struct dentry *res = NULL;	/* stays NULL if root_inode is NULL or allocation fails */
+ 
+ 	if (root_inode) {
+ 		res = d_alloc_anon(root_inode->i_sb);
+ 		if (res)
+ 			d_instantiate(res, root_inode);	/* attach the inode to the new dentry */
+ 		else
+ 			iput(root_inode);	/* allocation failed: drop the inode reference */
+ 	}
+ 	return res;
+ }
+ EXPORT_SYMBOL(d_make_root);
+ 
+ static struct dentry *__d_instantiate_anon(struct dentry *dentry,
+ 					   struct inode *inode,
+ 					   bool disconnected)
+ {
+ 	struct dentry *res;
+ 	unsigned add_flags;
+ 
+ 	security_d_instantiate(dentry, inode);
+ 	spin_lock(&inode->i_lock);
+ 	res = __d_find_any_alias(inode);	/* prefer an alias the inode already has */
+ 	if (res) {
+ 		spin_unlock(&inode->i_lock);
+ 		dput(dentry);	/* the dentry we were handed is not needed */
+ 		goto out_iput;
+ 	}
+ 
+ 	/* no alias yet: attach @dentry (flagged disconnected only if requested) */
+ 	add_flags = d_flags_for_inode(inode);
+ 
+ 	if (disconnected)
+ 		add_flags |= DCACHE_DISCONNECTED;
+ 
+ 	spin_lock(&dentry->d_lock);
+ 	__d_set_inode_and_type(dentry, inode, add_flags);
+ 	hlist_add_head(&dentry->d_u.d_alias, &inode->i_dentry);
+ 	if (!disconnected) {	/* otherwise it is hashed on the superblock's s_roots list */
+ 		hlist_bl_lock(&dentry->d_sb->s_roots);
+ 		hlist_bl_add_head(&dentry->d_hash, &dentry->d_sb->s_roots);
+ 		hlist_bl_unlock(&dentry->d_sb->s_roots);
+ 	}
+ 	spin_unlock(&dentry->d_lock);
+ 	spin_unlock(&inode->i_lock);
+ 
+ 	return dentry;
+ 
+  out_iput:
+ 	iput(inode);	/* alias path: drop the inode reference, return the alias */
+ 	return res;
+ }
+ 
+ struct dentry *d_instantiate_anon(struct dentry *dentry, struct inode *inode)
+ {
+ 	return __d_instantiate_anon(dentry, inode, true);	/* true: request DCACHE_DISCONNECTED */
+ }
+ EXPORT_SYMBOL(d_instantiate_anon);
+ 
+ static struct dentry *__d_obtain_alias(struct inode *inode, bool disconnected)
+ {
+ 	struct dentry *tmp;
+ 	struct dentry *res;
+ 
+ 	if (!inode)
+ 		return ERR_PTR(-ESTALE);	/* NULL inode is reported as -ESTALE */
+ 	if (IS_ERR(inode))
+ 		return ERR_CAST(inode);	/* hand the inode's error back as a dentry error */
+ 
+ 	res = d_find_any_alias(inode);
+ 	if (res)
+ 		goto out_iput;	/* existing alias: return it, drop the inode reference */
+ 
+ 	tmp = d_alloc_anon(inode->i_sb);
+ 	if (!tmp) {
+ 		res = ERR_PTR(-ENOMEM);
+ 		goto out_iput;	/* the error path also releases the inode */
+ 	}
+ 
+ 	return __d_instantiate_anon(tmp, inode, disconnected);	/* no iput() here; the callee takes over */
+ 
+ out_iput:
+ 	iput(inode);
+ 	return res;
+ }
+ 
+ /**
+  * d_obtain_alias - find or allocate a DISCONNECTED dentry for a given inode
+  * @inode: inode to allocate the dentry for
+  *
+  * Obtain a dentry for an inode resulting from NFS filehandle conversion or
+  * similar open by handle operations.  The returned dentry may be anonymous,
+  * or may have a full name (if the inode was already in the cache).
+  *
+  * When called on a directory inode, we must ensure that the inode only ever
+  * has one dentry.  If a dentry is found, that is returned instead of
+  * allocating a new one.
+  *
+  * On successful return, the reference to the inode has been transferred
+  * to the dentry.  In case of an error the reference on the inode is released.
+  * To make it easier to use in export operations a %NULL or IS_ERR inode may
+  * be passed in and the error will be propagated to the return value,
+  * with a %NULL @inode replaced by ERR_PTR(-ESTALE).
+  */
+ struct dentry *d_obtain_alias(struct inode *inode)
+ {
+ 	return __d_obtain_alias(inode, true);	/* true: a new dentry gets DCACHE_DISCONNECTED */
+ }
+ EXPORT_SYMBOL(d_obtain_alias);
+ 
+ /**
+  * d_obtain_root - find or allocate a dentry for a given inode
+  * @inode: inode to allocate the dentry for
+  *
+  * Obtain an IS_ROOT dentry for the root of a filesystem.
+  *
+  * We must ensure that directory inodes only ever have one dentry.  If a
+  * dentry is found, that is returned instead of allocating a new one.
+  *
+  * On successful return, the reference to the inode has been transferred
+  * to the dentry.  In case of an error the reference on the inode is
+  * released.  A %NULL or IS_ERR inode may be passed in and the
+  * error will be propagated to the return value, with a %NULL @inode
+  * replaced by ERR_PTR(-ESTALE).
+  */
+ struct dentry *d_obtain_root(struct inode *inode)
+ {
+ 	return __d_obtain_alias(inode, false);	/* false: a new dentry is not flagged disconnected */
+ }
+ EXPORT_SYMBOL(d_obtain_root);
+ 
+ /**
+  * d_add_ci - lookup or allocate new dentry with case-exact name
+  * @inode:  the inode case-insensitive lookup has found
+  * @dentry: the negative dentry that was passed to the parent's lookup func
+  * @name:   the case-exact name to be associated with the returned dentry
+  *
+  * This is to avoid filling the dcache with case-insensitive names to the
+  * same inode, only the actual correct case is stored in the dcache for
+  * case-insensitive filesystems.
+  *
+  * For a case-insensitive lookup match, if the case-exact dentry
+  * already exists in the dcache, use it and return it.
+  *
+  * If no entry exists with the exact case name, allocate new dentry with
+  * the exact case, and return the spliced entry.
+  */
+ struct dentry *d_add_ci(struct dentry *dentry, struct inode *inode,
+ 			struct qstr *name)
+ {
+ 	struct dentry *found, *res;
+ 
+ 	/*
+ 	 * First check if a dentry matching the name already exists,
+ 	 * if not go ahead and create it now.
+ 	 */
+ 	found = d_hash_and_lookup(dentry->d_parent, name);
+ 	if (found) {
+ 		iput(inode);	/* inode reference is released on this return path */
+ 		return found;
+ 	}
+ 	if (d_in_lookup(dentry)) {	/* in-lookup dentry: allocate via the parallel path */
+ 		found = d_alloc_parallel(dentry->d_parent, name,
+ 					dentry->d_wait);
+ 		if (IS_ERR(found) || !d_in_lookup(found)) {	/* error, or not a fresh in-lookup dentry */
+ 			iput(inode);
+ 			return found;
+ 		}
+ 	} else {
+ 		found = d_alloc(dentry->d_parent, name);
+ 		if (!found) {
+ 			iput(inode);
+ 			return ERR_PTR(-ENOMEM);
+ 		} 
+ 	}
+ 	res = d_splice_alias(inode, found);
+ 	if (res) {	/* non-NULL result (possibly an error pointer) replaces ours */
+ 		dput(found);
+ 		return res;
+ 	}
+ 	return found;
+ }
+ EXPORT_SYMBOL(d_add_ci);
+ 
+ 
+ static inline bool d_same_name(const struct dentry *dentry,
+ 				const struct dentry *parent,
+ 				const struct qstr *name)
+ {
+ 	if (likely(!(parent->d_flags & DCACHE_OP_COMPARE))) {	/* parent has no ->d_compare */
+ 		if (dentry->d_name.len != name->len)
+ 			return false;	/* different lengths can never match */
+ 		return dentry_cmp(dentry, name->name, name->len) == 0;
+ 	}
+ 	return parent->d_op->d_compare(dentry,	/* fs-specific compare; 0 means "same" */
+ 				       dentry->d_name.len, dentry->d_name.name,
+ 				       name) == 0;
+ }
+ 
+ /**
+  * __d_lookup_rcu - search for a dentry (racy, store-free)
+  * @parent: parent dentry
+  * @name: qstr of name we wish to find
+  * @seqp: returns d_seq value at the point where the dentry was found
+  * Returns: dentry, or NULL
+  *
+  * __d_lookup_rcu is the dcache lookup function for rcu-walk name
+  * resolution (store-free path walking) design described in
+  * Documentation/filesystems/path-lookup.txt.
+  *
+  * This is not to be used outside core vfs.
+  *
+  * __d_lookup_rcu must only be used in rcu-walk mode, ie. with vfsmount lock
+  * held, and rcu_read_lock held. The returned dentry must not be stored into
+  * without taking d_lock and checking d_seq sequence count against @seq
+  * returned here.
+  *
+  * A refcount may be taken on the found dentry with the d_rcu_to_refcount
+  * function.
+  *
+  * Alternatively, __d_lookup_rcu may be called again to look up the child of
+  * the returned dentry, so long as its parent's seqlock is checked after the
+  * child is looked up. Thus, an interlocking stepping of sequence lock checks
+  * is formed, giving integrity down the path walk.
+  *
+  * NOTE! The caller *has* to check the resulting dentry against the sequence
+  * number we've returned before using any of the resulting dentry state!
+  */
+ struct dentry *__d_lookup_rcu(const struct dentry *parent,
+ 				const struct qstr *name,
+ 				unsigned *seqp)
+ {
+ 	u64 hashlen = name->hash_len;
+ 	const unsigned char *str = name->name;
+ 	struct hlist_bl_head *b = d_hash(hashlen_hash(hashlen));
+ 	struct hlist_bl_node *node;
+ 	struct dentry *dentry;
+ 
+ 	/*
+ 	 * Note: There is significant duplication with __d_lookup which is
+ 	 * required to prevent single threaded performance regressions
+ 	 * especially on architectures where smp_rmb (in seqcounts) are costly.
+ 	 * Keep the two functions in sync.
+ 	 */
+ 
+ 	/*
+ 	 * The hash list is protected using RCU.
+ 	 *
+ 	 * Carefully use d_seq when comparing a candidate dentry, to avoid
+ 	 * races with d_move().
+ 	 *
+ 	 * It is possible that concurrent renames can mess up our list
+ 	 * walk here and result in missing our dentry, resulting in the
+ 	 * false-negative result. d_lookup() protects against concurrent
+ 	 * renames using rename_lock seqlock.
+ 	 *
+ 	 * See Documentation/filesystems/path-lookup.txt for more details.
+ 	 */
+ 	hlist_bl_for_each_entry_rcu(dentry, node, b, d_hash) {
+ 		unsigned seq;
+ 
+ seqretry:
+ 		/*
+ 		 * The dentry sequence count protects us from concurrent
+ 		 * renames, and thus protects parent and name fields.
+ 		 *
+ 		 * The caller must perform a seqcount check in order
+ 		 * to do anything useful with the returned dentry.
+ 		 *
+ 		 * NOTE! We do a "raw" seqcount_begin here. That means that
+ 		 * we don't wait for the sequence count to stabilize if it
+ 		 * is in the middle of a sequence change. If we do the slow
+ 		 * dentry compare, we will do seqretries until it is stable,
+ 		 * and if we end up with a successful lookup, we actually
+ 		 * want to exit RCU lookup anyway.
+ 		 *
+ 		 * Note that raw_seqcount_begin still *does* smp_rmb(), so
+ 		 * we are still guaranteed NUL-termination of ->d_name.name.
+ 		 */
+ 		seq = raw_seqcount_begin(&dentry->d_seq);
+ 		if (dentry->d_parent != parent)
+ 			continue;
+ 		if (d_unhashed(dentry))
+ 			continue;
+ 
+ 		if (unlikely(parent->d_flags & DCACHE_OP_COMPARE)) {
+ 			int tlen;
+ 			const char *tname;
+ 			if (dentry->d_name.hash != hashlen_hash(hashlen))
+ 				continue;
+ 			tlen = dentry->d_name.len;
+ 			tname = dentry->d_name.name;
+ 			/* we want a consistent (name,len) pair */
+ 			if (read_seqcount_retry(&dentry->d_seq, seq)) {
+ 				cpu_relax();
+ 				goto seqretry;	/* re-sample d_seq for the same dentry */
+ 			}
+ 			if (parent->d_op->d_compare(dentry,
+ 						    tlen, tname, name) != 0)
+ 				continue;
+ 		} else {
+ 			if (dentry->d_name.hash_len != hashlen)
+ 				continue;	/* hash and length compared in one go */
+ 			if (dentry_cmp(dentry, str, hashlen_len(hashlen)) != 0)
+ 				continue;
+ 		}
+ 		*seqp = seq;	/* the d_seq sample the caller has to validate */
+ 		return dentry;
+ 	}
+ 	return NULL;	/* nothing matched in this hash chain */
+ }
+ 
+ /**
+  * d_lookup - search for a dentry
+  * @parent: parent dentry
+  * @name: qstr of name we wish to find
+  * Returns: dentry, or NULL
+  *
+  * d_lookup searches the children of the parent dentry for the name in
+  * question. If the dentry is found its reference count is incremented and the
+  * dentry is returned. The caller must use dput to free the entry when it has
+  * finished using it. %NULL is returned if the dentry does not exist.
+  */
+ struct dentry *d_lookup(const struct dentry *parent, const struct qstr *name)
+ {
+ 	struct dentry *dentry;
+ 	unsigned seq;
+ 
+ 	do {
+ 		seq = read_seqbegin(&rename_lock);
+ 		dentry = __d_lookup(parent, name);
+ 		if (dentry)
+ 			break;	/* a hit is returned without rechecking rename_lock */
+ 	} while (read_seqretry(&rename_lock, seq));	/* a miss is retried if rename_lock changed */
+ 	return dentry;
+ }
+ EXPORT_SYMBOL(d_lookup);
+ 
+ /**
+  * __d_lookup - search for a dentry (racy)
+  * @parent: parent dentry
+  * @name: qstr of name we wish to find
+  * Returns: dentry, or NULL
+  *
+  * __d_lookup is like d_lookup, however it may (rarely) return a
+  * false-negative result due to unrelated rename activity.
+  *
+  * __d_lookup is slightly faster by avoiding rename_lock read seqlock,
+  * however it must be used carefully, eg. with a following d_lookup in
+  * the case of failure.
+  *
+  * __d_lookup callers must be commented.
+  */
+ struct dentry *__d_lookup(const struct dentry *parent, const struct qstr *name)
+ {
+ 	unsigned int hash = name->hash;
+ 	struct hlist_bl_head *b = d_hash(hash);
+ 	struct hlist_bl_node *node;
+ 	struct dentry *found = NULL;	/* stays NULL when nothing matches */
+ 	struct dentry *dentry;
+ 
+ 	/*
+ 	 * Note: There is significant duplication with __d_lookup_rcu which is
+ 	 * required to prevent single threaded performance regressions
+ 	 * especially on architectures where smp_rmb (in seqcounts) are costly.
+ 	 * Keep the two functions in sync.
+ 	 */
+ 
+ 	/*
+ 	 * The hash list is protected using RCU.
+ 	 *
+ 	 * Take d_lock when comparing a candidate dentry, to avoid races
+ 	 * with d_move().
+ 	 *
+ 	 * It is possible that concurrent renames can mess up our list
+ 	 * walk here and result in missing our dentry, resulting in the
+ 	 * false-negative result. d_lookup() protects against concurrent
+ 	 * renames using rename_lock seqlock.
+ 	 *
+ 	 * See Documentation/filesystems/path-lookup.txt for more details.
+ 	 */
+ 	rcu_read_lock();
+ 	
+ 	hlist_bl_for_each_entry_rcu(dentry, node, b, d_hash) {
+ 
+ 		if (dentry->d_name.hash != hash)	/* lockless filter before taking d_lock */
+ 			continue;
+ 
+ 		spin_lock(&dentry->d_lock);
+ 		if (dentry->d_parent != parent)
+ 			goto next;
+ 		if (d_unhashed(dentry))
+ 			goto next;
+ 
+ 		if (!d_same_name(dentry, parent, name))
+ 			goto next;
+ 
+ 		dentry->d_lockref.count++;	/* reference taken while d_lock is held */
+ 		found = dentry;
+ 		spin_unlock(&dentry->d_lock);
+ 		break;
+ next:
+ 		spin_unlock(&dentry->d_lock);
+  	}
+  	rcu_read_unlock();
+ 
+  	return found;
+ }
+ 
+ /**
+  * d_hash_and_lookup - hash the qstr then search for a dentry
+  * @dir: Directory to search in
+  * @name: qstr of name we wish to find
+  *
+  * On lookup failure NULL is returned; on bad name - ERR_PTR(-error)
+  */
+ struct dentry *d_hash_and_lookup(struct dentry *dir, struct qstr *name)
+ {
+ 	/*
+ 	 * Check for a fs-specific hash function. Note that we must
+ 	 * calculate the standard hash first, as the d_op->d_hash()
+ 	 * routine may choose to leave the hash value unchanged.
+ 	 */
+ 	name->hash = full_name_hash(dir, name->name, name->len);	/* writes into the caller's qstr */
+ 	if (dir->d_flags & DCACHE_OP_HASH) {
+ 		int err = dir->d_op->d_hash(dir, name);
+ 		if (unlikely(err < 0))
+ 			return ERR_PTR(err);	/* ->d_hash() failure becomes an error pointer */
+ 	}
+ 	return d_lookup(dir, name);
+ }
+ EXPORT_SYMBOL(d_hash_and_lookup);
+ 
+ /*
+  * When a file is deleted, we have two options:
+  * - turn this dentry into a negative dentry
+  * - unhash this dentry and free it.
+  *
+  * Usually, we want to just turn this into
+  * a negative dentry, but if anybody else is
+  * currently using the dentry or the inode
+  * we can't do that and we fall back on removing
+  * it from the hash queues and waiting for
+  * it to be deleted later when it has no users
+  */
+  
+ /**
+  * d_delete - delete a dentry
+  * @dentry: The dentry to delete
+  *
+  * Turn the dentry into a negative dentry if possible, otherwise
+  * remove it from the hash queues so it can be deleted later
+  */
+  
+ void d_delete(struct dentry * dentry)
+ {
+ 	struct inode *inode = dentry->d_inode;	/* NOTE(review): used below without a NULL check; confirm callers */
+ 
+ 	spin_lock(&inode->i_lock);	/* i_lock first, then d_lock */
+ 	spin_lock(&dentry->d_lock);
+ 	/*
+ 	 * Are we the only user?
+ 	 */
+ 	if (dentry->d_lockref.count == 1) {
+ 		dentry->d_flags &= ~DCACHE_CANT_MOUNT;
+ 		dentry_unlink_inode(dentry);	/* NOTE(review): no unlocks on this branch; presumably done by the callee - confirm */
+ 	} else {
+ 		__d_drop(dentry);	/* other users remain: only unhash it */
+ 		spin_unlock(&dentry->d_lock);
+ 		spin_unlock(&inode->i_lock);
+ 	}
+ }
+ EXPORT_SYMBOL(d_delete);
+ 
+ static void __d_rehash(struct dentry *entry)
+ {
+ 	struct hlist_bl_head *b = d_hash(entry->d_name.hash);	/* bucket picked from the name hash */
+ 
+ 	hlist_bl_lock(b);
+ 	hlist_bl_add_head_rcu(&entry->d_hash, b);	/* inserted at the head of the chain */
+ 	hlist_bl_unlock(b);
+ }
+ 
+ /**
+  * d_rehash	- add an entry back to the hash
+  * @entry: dentry to add to the hash
+  *
+  * Adds a dentry to the hash according to its name.
+  */
+  
+ void d_rehash(struct dentry * entry)
+ {
+ 	spin_lock(&entry->d_lock);	/* __d_rehash() is called with d_lock held */
+ 	__d_rehash(entry);
+ 	spin_unlock(&entry->d_lock);
+ }
+ EXPORT_SYMBOL(d_rehash);
+ 
+ static inline unsigned start_dir_add(struct inode *dir)
+ {
+ 
+ 	for (;;) {
+ 		unsigned n = dir->i_dir_seq;
+ 		if (!(n & 1) && cmpxchg(&dir->i_dir_seq, n, n + 1) == n)	/* even -> odd, atomically */
+ 			return n;	/* the even value we replaced; passed to end_dir_add() */
+ 		cpu_relax();	/* value was odd or the cmpxchg lost: try again */
+ 	}
+ }
+ 
+ static inline void end_dir_add(struct inode *dir, unsigned n)
+ {
+ 	smp_store_release(&dir->i_dir_seq, n + 2);	/* next even value after start_dir_add()'s n */
+ }
+ 
+ static void d_wait_lookup(struct dentry *dentry)
+ {
+ 	if (d_in_lookup(dentry)) {	/* nothing to wait for otherwise */
+ 		DECLARE_WAITQUEUE(wait, current);
+ 		add_wait_queue(dentry->d_wait, &wait);
+ 		do {
+ 			set_current_state(TASK_UNINTERRUPTIBLE);
+ 			spin_unlock(&dentry->d_lock);	/* d_lock is released while sleeping */
+ 			schedule();
+ 			spin_lock(&dentry->d_lock);	/* ... and re-taken before re-checking */
+ 		} while (d_in_lookup(dentry));
+ 	}
+ }
+ 
+ struct dentry *d_alloc_parallel(struct dentry *parent,
+ 				const struct qstr *name,
+ 				wait_queue_head_t *wq)
+ {
+ 	unsigned int hash = name->hash;
+ 	struct hlist_bl_head *b = in_lookup_hash(parent, hash);
+ 	struct hlist_bl_node *node;
+ 	struct dentry *new = d_alloc(parent, name);	/* allocated up front; dput() if unused */
+ 	struct dentry *dentry;
+ 	unsigned seq, r_seq, d_seq;
+ 
+ 	if (unlikely(!new))
+ 		return ERR_PTR(-ENOMEM);
+ 
+ retry:
+ 	rcu_read_lock();
+ 	seq = smp_load_acquire(&parent->d_inode->i_dir_seq);	/* sampled before the lookup */
+ 	r_seq = read_seqbegin(&rename_lock);
+ 	dentry = __d_lookup_rcu(parent, name, &d_seq);
+ 	if (unlikely(dentry)) {
+ 		if (!lockref_get_not_dead(&dentry->d_lockref)) {
+ 			rcu_read_unlock();
+ 			goto retry;
+ 		}
+ 		if (read_seqcount_retry(&dentry->d_seq, d_seq)) {	/* dentry changed under us */
+ 			rcu_read_unlock();
+ 			dput(dentry);
+ 			goto retry;
+ 		}
+ 		rcu_read_unlock();
+ 		dput(new);	/* an existing dentry is returned instead of ours */
+ 		return dentry;
+ 	}
+ 	if (unlikely(read_seqretry(&rename_lock, r_seq))) {	/* miss may be due to a rename */
+ 		rcu_read_unlock();
+ 		goto retry;
+ 	}
+ 
+ 	if (unlikely(seq & 1)) {	/* odd: sampled while start_dir_add() held the counter */
+ 		rcu_read_unlock();
+ 		goto retry;
+ 	}
+ 
+ 	hlist_bl_lock(b);
+ 	if (unlikely(READ_ONCE(parent->d_inode->i_dir_seq) != seq)) {	/* counter moved since the sample */
+ 		hlist_bl_unlock(b);
+ 		rcu_read_unlock();
+ 		goto retry;
+ 	}
+ 	/*
+ 	 * No changes for the parent since the beginning of d_lookup().
+ 	 * Since all removals from the chain happen with hlist_bl_lock(),
+ 	 * any potential in-lookup matches are going to stay here until
+ 	 * we unlock the chain.  All fields are stable in everything
+ 	 * we encounter.
+ 	 */
+ 	hlist_bl_for_each_entry(dentry, node, b, d_u.d_in_lookup_hash) {
+ 		if (dentry->d_name.hash != hash)
+ 			continue;
+ 		if (dentry->d_parent != parent)
+ 			continue;
+ 		if (!d_same_name(dentry, parent, name))
+ 			continue;
+ 		hlist_bl_unlock(b);
+ 		/* now we can try to grab a reference */
+ 		if (!lockref_get_not_dead(&dentry->d_lockref)) {
+ 			rcu_read_unlock();
+ 			goto retry;
+ 		}
+ 
+ 		rcu_read_unlock();
+ 		/*
+ 		 * somebody is likely to be still doing lookup for it;
+ 		 * wait for them to finish
+ 		 */
+ 		spin_lock(&dentry->d_lock);
+ 		d_wait_lookup(dentry);
+ 		/*
+ 		 * it's not in-lookup anymore; in principle we should repeat
+ 		 * everything from dcache lookup, but it's likely to be what
+ 		 * d_lookup() would've found anyway.  If it is, just return it;
+ 		 * otherwise we really have to repeat the whole thing.
+ 		 */
+ 		if (unlikely(dentry->d_name.hash != hash))
+ 			goto mismatch;
+ 		if (unlikely(dentry->d_parent != parent))
+ 			goto mismatch;
+ 		if (unlikely(d_unhashed(dentry)))
+ 			goto mismatch;
+ 		if (unlikely(!d_same_name(dentry, parent, name)))
+ 			goto mismatch;
+ 		/* OK, it *is* a hashed match; return it */
+ 		spin_unlock(&dentry->d_lock);
+ 		dput(new);
+ 		return dentry;
+ 	}
+ 	rcu_read_unlock();
+ 	/* we can't take ->d_lock here; it's OK, though. */
+ 	new->d_flags |= DCACHE_PAR_LOOKUP;
+ 	new->d_wait = wq;
+ 	hlist_bl_add_head_rcu(&new->d_u.d_in_lookup_hash, b);
+ 	hlist_bl_unlock(b);
+ 	return new;	/* returned with DCACHE_PAR_LOOKUP still set */
+ mismatch:
+ 	spin_unlock(&dentry->d_lock);
+ 	dput(dentry);
+ 	goto retry;
+ }
+ EXPORT_SYMBOL(d_alloc_parallel);
+ 
+ void __d_lookup_done(struct dentry *dentry)
+ {
+ 	struct hlist_bl_head *b = in_lookup_hash(dentry->d_parent,
+ 						 dentry->d_name.hash);
+ 	hlist_bl_lock(b);
+ 	dentry->d_flags &= ~DCACHE_PAR_LOOKUP;	/* cleared under the in-lookup bucket lock */
+ 	__hlist_bl_del(&dentry->d_u.d_in_lookup_hash);
+ 	wake_up_all(dentry->d_wait);	/* wake everything queued on the wait queue */
+ 	dentry->d_wait = NULL;
+ 	hlist_bl_unlock(b);
+ 	INIT_HLIST_NODE(&dentry->d_u.d_alias);	/* d_u.d_in_lookup_hash is no longer in use */
+ 	INIT_LIST_HEAD(&dentry->d_lru);
+ }
+ EXPORT_SYMBOL(__d_lookup_done);
+ 
+ /* inode->i_lock held if inode is non-NULL */
+ 
+ static inline void __d_add(struct dentry *dentry, struct inode *inode)
+ {
+ 	struct inode *dir = NULL;	/* set only when finishing an in-lookup dentry */
+ 	unsigned n;	/* valid only when dir is non-NULL */
+ 	spin_lock(&dentry->d_lock);
+ 	if (unlikely(d_in_lookup(dentry))) {
+ 		dir = dentry->d_parent->d_inode;
+ 		n = start_dir_add(dir);
+ 		__d_lookup_done(dentry);
+ 	}
+ 	if (inode) {
+ 		unsigned add_flags = d_flags_for_inode(inode);
+ 		hlist_add_head(&dentry->d_u.d_alias, &inode->i_dentry);
+ 		raw_write_seqcount_begin(&dentry->d_seq);
+ 		__d_set_inode_and_type(dentry, inode, add_flags);
+ 		raw_write_seqcount_end(&dentry->d_seq);
+ 		fsnotify_update_flags(dentry);
+ 	}
+ 	__d_rehash(dentry);	/* hashed whether or not an inode was attached */
+ 	if (dir)
+ 		end_dir_add(dir, n);	/* pairs with start_dir_add() above */
+ 	spin_unlock(&dentry->d_lock);
+ 	if (inode)
+ 		spin_unlock(&inode->i_lock);	/* releases the i_lock the caller took */
+ }
+ 
+ /**
+  * d_add - add dentry to hash queues
+  * @entry: dentry to add
+  * @inode: The inode to attach to this dentry
+  *
+  * This adds the entry to the hash queues and initializes @inode.
+  * The entry was actually filled in earlier during d_alloc().
+  */
+ 
+ void d_add(struct dentry *entry, struct inode *inode)
+ {
+ 	if (inode) {
+ 		security_d_instantiate(entry, inode);
+ 		spin_lock(&inode->i_lock);	/* released inside __d_add() */
+ 	}
+ 	__d_add(entry, inode);	/* also called for a NULL inode */
+ }
+ EXPORT_SYMBOL(d_add);
+ 
+ /**
+  * d_exact_alias - find and hash an exact unhashed alias
+  * @entry: dentry to add
+  * @inode: The inode to go with this dentry
+  *
+  * If an unhashed dentry with the same name/parent and desired
+  * inode already exists, hash and return it.  Otherwise, return
+  * NULL.
+  *
+  * Parent directory should be locked.
+  */
+ struct dentry *d_exact_alias(struct dentry *entry, struct inode *inode)
+ {
+ 	struct dentry *alias;
+ 	unsigned int hash = entry->d_name.hash;
+ 
+ 	spin_lock(&inode->i_lock);	/* held while walking the inode's alias list */
+ 	hlist_for_each_entry(alias, &inode->i_dentry, d_u.d_alias) {
+ 		/*
+ 		 * Don't need alias->d_lock here, because aliases with
+ 		 * d_parent == entry->d_parent are not subject to name or
+ 		 * parent changes, because the parent inode i_mutex is held.
+ 		 */
+ 		if (alias->d_name.hash != hash)
+ 			continue;
+ 		if (alias->d_parent != entry->d_parent)
+ 			continue;
+ 		if (!d_same_name(alias, entry->d_parent, &entry->d_name))
+ 			continue;
+ 		spin_lock(&alias->d_lock);
+ 		if (!d_unhashed(alias)) {
+ 			spin_unlock(&alias->d_lock);
+ 			alias = NULL;	/* exact alias is already hashed: return NULL */
+ 		} else {
+ 			__dget_dlock(alias);	/* take a reference, then hash it */
+ 			__d_rehash(alias);
+ 			spin_unlock(&alias->d_lock);
+ 		}
+ 		spin_unlock(&inode->i_lock);
+ 		return alias;	/* the first exact match decides the result */
+ 	}
+ 	spin_unlock(&inode->i_lock);
+ 	return NULL;
+ }
+ EXPORT_SYMBOL(d_exact_alias);
+ 
+ static void swap_names(struct dentry *dentry, struct dentry *target)
+ {
+ 	if (unlikely(dname_external(target))) {
+ 		if (unlikely(dname_external(dentry))) {
+ 			/*
+ 			 * Both external: swap the pointers
+ 			 */
+ 			swap(target->d_name.name, dentry->d_name.name);
+ 		} else {
+ 			/*
+ 			 * dentry:internal, target:external.  Steal target's
+ 			 * storage and make target internal.
+ 			 */
+ 			memcpy(target->d_iname, dentry->d_name.name,
+ 					dentry->d_name.len + 1);	/* len + 1: one byte past the name */
+ 			dentry->d_name.name = target->d_name.name;
+ 			target->d_name.name = target->d_iname;
+ 		}
+ 	} else {
+ 		if (unlikely(dname_external(dentry))) {
+ 			/*
+ 			 * dentry:external, target:internal.  Give dentry's
+ 			 * storage to target and make dentry internal
+ 			 */
+ 			memcpy(dentry->d_iname, target->d_name.name,
+ 					target->d_name.len + 1);	/* len + 1: one byte past the name */
+ 			target->d_name.name = dentry->d_name.name;
+ 			dentry->d_name.name = dentry->d_iname;
+ 		} else {
+ 			/*
+ 			 * Both are internal: swap d_iname one long at a time.
+ 			 */
+ 			unsigned int i;
+ 			BUILD_BUG_ON(!IS_ALIGNED(DNAME_INLINE_LEN, sizeof(long)));
+ 			for (i = 0; i < DNAME_INLINE_LEN / sizeof(long); i++) {
+ 				swap(((long *) &dentry->d_iname)[i],
+ 				     ((long *) &target->d_iname)[i]);
+ 			}
+ 		}
+ 	}
+ 	swap(dentry->d_name.hash_len, target->d_name.hash_len);	/* hash_len follows its name */
+ }
+ 
+ static void copy_name(struct dentry *dentry, struct dentry *target)
+ {
+ 	struct external_name *old_name = NULL;	/* dentry's previous external name, if any */
+ 	if (unlikely(dname_external(dentry)))
+ 		old_name = external_name(dentry);
+ 	if (unlikely(dname_external(target))) {
+ 		atomic_inc(&external_name(target)->u.count);	/* one more user of target's external name */
+ 		dentry->d_name = target->d_name;	/* whole-struct copy of d_name */
+ 	} else {
+ 		memcpy(dentry->d_iname, target->d_name.name,
+ 				target->d_name.len + 1);
+ 		dentry->d_name.name = dentry->d_iname;	/* point at our own inline copy */
+ 		dentry->d_name.hash_len = target->d_name.hash_len;
+ 	}
+ 	if (old_name && likely(atomic_dec_and_test(&old_name->u.count)))
+ 		kfree_rcu(old_name, u.head);	/* count hit zero: freed via kfree_rcu() */
+ }
+ 
+ /*
+  * __d_move - move a dentry
+  * @dentry: entry to move
+  * @target: new dentry
+  * @exchange: exchange the two dentries
+  *
+  * Update the dcache to reflect the move of a file name. Negative
+  * dcache entries should not be moved in this way. Caller must hold
+  * rename_lock, the i_mutex of the source and target directories,
+  * and the sb->s_vfs_rename_mutex if they differ. See lock_rename().
+  */
+ static void __d_move(struct dentry *dentry, struct dentry *target,
+ 		     bool exchange)
+ {
+ 	struct dentry *old_parent, *p;
+ 	struct inode *dir = NULL;	/* set only when target was still in-lookup */
+ 	unsigned n;
+ 
+ 	WARN_ON(!dentry->d_inode);
+ 	if (WARN_ON(dentry == target))
+ 		return;
+ 
+ 	BUG_ON(d_ancestor(target, dentry));
+ 	old_parent = dentry->d_parent;
+ 	p = d_ancestor(old_parent, target);
+ 	if (IS_ROOT(dentry)) {
+ 		BUG_ON(p);
+ 		spin_lock(&target->d_parent->d_lock);
+ 	} else if (!p) {
+ 		/* target is not a descendent of dentry->d_parent */
+ 		spin_lock(&target->d_parent->d_lock);
+ 		spin_lock_nested(&old_parent->d_lock, DENTRY_D_LOCK_NESTED);
+ 	} else {
+ 		BUG_ON(p == dentry);
+ 		spin_lock(&old_parent->d_lock);
+ 		if (p != target)	/* p == target means both share old_parent */
+ 			spin_lock_nested(&target->d_parent->d_lock,
+ 					DENTRY_D_LOCK_NESTED);
+ 	}
+ 	spin_lock_nested(&dentry->d_lock, 2);	/* parents first, then dentry, then target */
+ 	spin_lock_nested(&target->d_lock, 3);
+ 
+ 	if (unlikely(d_in_lookup(target))) {
+ 		dir = target->d_parent->d_inode;
+ 		n = start_dir_add(dir);
+ 		__d_lookup_done(target);
+ 	}
+ 
+ 	write_seqcount_begin(&dentry->d_seq);
+ 	write_seqcount_begin_nested(&target->d_seq, DENTRY_D_LOCK_NESTED);
+ 
+ 	/* unhash both */
+ 	if (!d_unhashed(dentry))
+ 		___d_drop(dentry);
+ 	if (!d_unhashed(target))
+ 		___d_drop(target);
+ 
+ 	/* ... and switch them in the tree */
+ 	dentry->d_parent = target->d_parent;
+ 	if (!exchange) {
+ 		copy_name(dentry, target);
+ 		target->d_hash.pprev = NULL;	/* NOTE(review): presumably leaves target looking unhashed; confirm */
+ 		dentry->d_parent->d_lockref.count++;	/* new parent gains a reference */
+ 		if (dentry != old_parent) /* wasn't IS_ROOT */
+ 			WARN_ON(!--old_parent->d_lockref.count);
+ 	} else {
+ 		target->d_parent = old_parent;
+ 		swap_names(dentry, target);
+ 		list_move(&target->d_child, &target->d_parent->d_subdirs);
+ 		__d_rehash(target);
+ 		fsnotify_update_flags(target);
+ 	}
+ 	list_move(&dentry->d_child, &dentry->d_parent->d_subdirs);
+ 	__d_rehash(dentry);
+ 	fsnotify_update_flags(dentry);
+ 	fscrypt_handle_d_move(dentry);
+ 
+ 	write_seqcount_end(&target->d_seq);	/* ended in reverse order of the begins */
+ 	write_seqcount_end(&dentry->d_seq);
+ 
+ 	if (dir)
+ 		end_dir_add(dir, n);	/* pairs with start_dir_add() above */
+ 
+ 	if (dentry->d_parent != old_parent)	/* new parent differs: it was locked separately */
+ 		spin_unlock(&dentry->d_parent->d_lock);
+ 	if (dentry != old_parent)	/* IS_ROOT case never locked old_parent */
+ 		spin_unlock(&old_parent->d_lock);
+ 	spin_unlock(&target->d_lock);
+ 	spin_unlock(&dentry->d_lock);
+ }
+ 
+ /*
+  * d_move - move a dentry
+  * @dentry: entry to move
+  * @target: new dentry
+  *
+  * Update the dcache to reflect the move of a file name. Negative
+  * dcache entries should not be moved in this way. See the locking
+  * requirements for __d_move.
+  */
+ void d_move(struct dentry *dentry, struct dentry *target)
+ {
+ 	write_seqlock(&rename_lock);	/* __d_move() runs inside the rename_lock write section */
+ 	__d_move(dentry, target, false);	/* false: plain move, not an exchange */
+ 	write_sequnlock(&rename_lock);
+ }
+ EXPORT_SYMBOL(d_move);
+ 
+ /*
+  * d_exchange - exchange two dentries
+  * @dentry1: first dentry
+  * @dentry2: second dentry
+  */
+ void d_exchange(struct dentry *dentry1, struct dentry *dentry2)
+ {
+ 	write_seqlock(&rename_lock);
+ 
+ 	WARN_ON(!dentry1->d_inode);	/* both dentries are expected to have an inode */
+ 	WARN_ON(!dentry2->d_inode);
+ 	WARN_ON(IS_ROOT(dentry1));	/* ... and neither is expected to be a root */
+ 	WARN_ON(IS_ROOT(dentry2));
+ 
+ 	__d_move(dentry1, dentry2, true);	/* true: exchange the two dentries */
+ 
+ 	write_sequnlock(&rename_lock);
+ }
+ 
+ /**
+  * d_ancestor - search for an ancestor
+  * @p1: ancestor dentry
+  * @p2: child dentry
+  *
+  * Returns the ancestor dentry of p2 which is a child of p1, if p1 is
+  * an ancestor of p2, else NULL.
+  */
+ struct dentry *d_ancestor(struct dentry *p1, struct dentry *p2)
+ {
+ 	struct dentry *p;
+ 
+ 	for (p = p2; !IS_ROOT(p); p = p->d_parent) {	/* walk from p2 up through d_parent */
+ 		if (p->d_parent == p1)
+ 			return p;	/* p is p1's child on the way to p2 (may be p2 itself) */
+ 	}
+ 	return NULL;	/* reached a root without meeting p1 */
+ }
+ 
+ /*
+  * This helper attempts to cope with remotely renamed directories
+  *
+  * It assumes that the caller is already holding
+  * dentry->d_parent->d_inode->i_mutex, and rename_lock
+  *
+  * Note: If ever the locking in lock_rename() changes, then please
+  * remember to update this too...
+  */
+ static int __d_unalias(struct inode *inode,
+ 		struct dentry *dentry, struct dentry *alias)
+ {
+ 	struct mutex *m1 = NULL;	/* non-NULL once s_vfs_rename_mutex was taken */
+ 	struct rw_semaphore *m2 = NULL;	/* non-NULL once the parent's i_rwsem was taken */
+ 	int ret = -ESTALE;	/* returned when either trylock fails */
+ 
+ 	/* If alias and dentry share a parent, then no extra locks required */
+ 	if (alias->d_parent == dentry->d_parent)
+ 		goto out_unalias;
+ 
+ 	/* See lock_rename() */
+ 	if (!mutex_trylock(&dentry->d_sb->s_vfs_rename_mutex))
+ 		goto out_err;
+ 	m1 = &dentry->d_sb->s_vfs_rename_mutex;
+ 	if (!inode_trylock_shared(alias->d_parent->d_inode))
+ 		goto out_err;
+ 	m2 = &alias->d_parent->d_inode->i_rwsem;	/* recorded before __d_move() runs */
+ out_unalias:
+ 	__d_move(alias, dentry, false);
+ 	ret = 0;
+ out_err:
+ 	if (m2)	/* unlock only what was actually taken */
+ 		up_read(m2);
+ 	if (m1)
+ 		mutex_unlock(m1);
+ 	return ret;
+ }
+ 
+ /**
+  * d_splice_alias - splice a disconnected dentry into the tree if one exists
+  * @inode:  the inode which may have a disconnected dentry (NULL or ERR_PTR ok)
+  * @dentry: a negative dentry which we want to point to the inode.
+  *
+  * If inode is a directory and has an IS_ROOT alias, then d_move that in
+  * place of the given dentry and return it, else simply d_add the inode
+  * to the dentry and return NULL.
+  *
+  * If the alias is not IS_ROOT, __d_unalias() tries to move it into place;
+  * an alias that is an ancestor of @dentry yields ERR_PTR(-ELOOP) instead.
+  *
+  * This is needed in the lookup routine of any filesystem that is exportable
+  * (via knfsd) so that we can build dcache paths to directories effectively.
+  *
+  * Returns the moved alias, NULL if @inode was simply added to @dentry, or
+  * an ERR_PTR on failure.  This matches the expected return of ->lookup.
+  *
+  * Cluster filesystems may call this function with a negative, hashed dentry.
+  * In that case, we know that the inode will be a regular file, and also this
+  * will only occur during atomic_open. So we need to check for the dentry
+  * being already hashed only in the final case.
+  */
+ struct dentry *d_splice_alias(struct inode *inode, struct dentry *dentry)
+ {
+ 	if (IS_ERR(inode))
+ 		return ERR_CAST(inode);	/* pass the caller's error straight back */
+ 
+ 	BUG_ON(!d_unhashed(dentry));	/* NOTE(review): header mentions hashed dentries - confirm */
+ 
+ 	if (!inode)
+ 		goto out;	/* nothing to splice: __d_add() is called with a NULL inode */
+ 
+ 	security_d_instantiate(dentry, inode);
+ 	spin_lock(&inode->i_lock);
+ 	if (S_ISDIR(inode->i_mode)) {
+ 		struct dentry *new = __d_find_any_alias(inode);
+ 		if (unlikely(new)) {
+ 			/* The reference to new ensures it remains an alias */
+ 			spin_unlock(&inode->i_lock);
+ 			write_seqlock(&rename_lock);
+ 			if (unlikely(d_ancestor(new, dentry))) {
+ 				write_sequnlock(&rename_lock);
+ 				dput(new);
+ 				new = ERR_PTR(-ELOOP);
+ 				pr_warn_ratelimited(
+ 					"VFS: Lookup of '%s' in %s %s"
+ 					" would have caused loop\n",
+ 					dentry->d_name.name,
+ 					inode->i_sb->s_type->name,
+ 					inode->i_sb->s_id);
+ 			} else if (!IS_ROOT(new)) {
+ 				struct dentry *old_parent = dget(new->d_parent);
+ 				int err = __d_unalias(inode, dentry, new);
+ 				write_sequnlock(&rename_lock);
+ 				if (err) {
+ 					dput(new);
+ 					new = ERR_PTR(err);
+ 				}
+ 				dput(old_parent);
+ 			} else {
+ 				__d_move(new, dentry, false);
+ 				write_sequnlock(&rename_lock);
+ 			}
+ 			iput(inode);	/* done on every path that found an alias, error or not */
+ 			return new;
+ 		}
+ 	}
+ out:	/* NOTE(review): i_lock is still held when arriving from the S_ISDIR test; presumably __d_add() releases it - confirm */
+ 	__d_add(dentry, inode);
+ 	return NULL;
+ }
+ EXPORT_SYMBOL(d_splice_alias);
+ 
+ /*
+  * Test whether new_dentry is a subdirectory of old_dentry.
+  *
+  * Trivially implemented with d_ancestor() on the dcache structure
+  */
+ 
+ /**
+  * is_subdir - is new dentry a subdirectory of old_dentry
+  * @new_dentry: dentry to test (the possible descendant)
+  * @old_dentry: dentry that may be an ancestor of @new_dentry
+  *
+  * Returns true if new_dentry is old_dentry itself or lies below it (at any
+  * depth); returns false otherwise.
+  * Caller must ensure that "new_dentry" is pinned before calling is_subdir()
+  */
+   
+ bool is_subdir(struct dentry *new_dentry, struct dentry *old_dentry)
+ {
+ 	bool result;
+ 	unsigned seq;
+ 
+ 	if (new_dentry == old_dentry)
+ 		return true;	/* a dentry counts as its own subdirectory here */
+ 
+ 	do {
+ 		/* for restarting inner loop in case of seq retry */
+ 		seq = read_seqbegin(&rename_lock);
+ 		/*
+ 		 * Need rcu_readlock to protect against the d_parent trashing
+ 		 * due to d_move
+ 		 */
+ 		rcu_read_lock();
+ 		if (d_ancestor(old_dentry, new_dentry))
+ 			result = true;
+ 		else
+ 			result = false;
+ 		rcu_read_unlock();
+ 	} while (read_seqretry(&rename_lock, seq));	/* redo the walk if rename_lock changed meanwhile */
+ 
+ 	return result;
+ }
+ EXPORT_SYMBOL(is_subdir);
+ 
+ static enum d_walk_ret d_genocide_kill(void *data, struct dentry *dentry)
+ {
+ 	struct dentry *root = data;	/* d_genocide() passes its @parent as @data */
+ 	if (dentry != root) {	/* the walk root itself is left untouched */
+ 		if (d_unhashed(dentry) || !dentry->d_inode)
+ 			return D_WALK_SKIP;	/* unhashed or negative dentry: not processed */
+ 		/* DCACHE_GENOCIDE marks the dentry so the count is dropped only once. */
+ 		if (!(dentry->d_flags & DCACHE_GENOCIDE)) {
+ 			dentry->d_flags |= DCACHE_GENOCIDE;
+ 			dentry->d_lockref.count--;
+ 		}
+ 	}
+ 	return D_WALK_CONTINUE;
+ }
+ 
+ void d_genocide(struct dentry *parent)
+ {
+ 	d_walk(parent, parent, d_genocide_kill);	/* @parent is both the walk start and the callback's data */
+ }
+ 
+ EXPORT_SYMBOL(d_genocide);
+ 
+ void d_tmpfile(struct dentry *dentry, struct inode *inode)
+ {
+ 	inode_dec_link_count(inode);
+ 	BUG_ON(dentry->d_name.name != dentry->d_iname ||	/* name must live in the inline buffer */
+ 		!hlist_unhashed(&dentry->d_u.d_alias) ||	/* must not be on an alias list yet */
+ 		!d_unlinked(dentry));
+ 	spin_lock(&dentry->d_parent->d_lock);
+ 	spin_lock_nested(&dentry->d_lock, DENTRY_D_LOCK_NESTED);
+ 	dentry->d_name.len = sprintf(dentry->d_iname, "#%llu",	/* rename to "#<inode number>" under both d_locks */
+ 				(unsigned long long)inode->i_ino);
+ 	spin_unlock(&dentry->d_lock);
+ 	spin_unlock(&dentry->d_parent->d_lock);
+ 	d_instantiate(dentry, inode);
+ }
+ EXPORT_SYMBOL(d_tmpfile);
+ 
+ static __initdata unsigned long dhash_entries;	/* stays 0 unless "dhash_entries=" is given */
+ static int __init set_dhash_entries(char *str)
+ {
+ 	if (!str)
+ 		return 0;
+ 	dhash_entries = simple_strtoul(str, &str, 0);	/* base 0; trailing characters are not checked */
+ 	return 1;
+ }
+ __setup("dhash_entries=", set_dhash_entries);
+ 
+ static void __init dcache_init_early(void)
+ {
+ 	/* If hashes are distributed across NUMA nodes, defer
+ 	 * hash allocation until vmalloc space is available.
+ 	 */
+ 	if (hashdist)
+ 		return;	/* dcache_init() allocates the table in this case */
+ 
+ 	dentry_hashtable =
+ 		alloc_large_system_hash("Dentry cache",
+ 					sizeof(struct hlist_bl_head),
+ 					dhash_entries,
+ 					13,
+ 					HASH_EARLY | HASH_ZERO,	/* only difference from the dcache_init() call */
+ 					&d_hash_shift,
+ 					NULL,
+ 					0,
+ 					0);
+ 	d_hash_shift = 32 - d_hash_shift;	/* NOTE(review): presumably log2 table size -> shift for a 32-bit hash; confirm */
+ }
+ 
+ static void __init dcache_init(void)
+ {
+ 	/*
+ 	 * A constructor could be added for stable state like the lists,
+ 	 * but it is probably not worth it because of the cache nature
+ 	 * of the dcache.
+ 	 */
+ 	dentry_cache = KMEM_CACHE_USERCOPY(dentry,
+ 		SLAB_RECLAIM_ACCOUNT|SLAB_PANIC|SLAB_MEM_SPREAD|SLAB_ACCOUNT,
+ 		d_iname);	/* d_iname is the field named for the usercopy region */
+ 
+ 	/* Hash may have been set up in dcache_init_early */
+ 	if (!hashdist)
+ 		return;	/* table already allocated by dcache_init_early() */
+ 
+ 	dentry_hashtable =
+ 		alloc_large_system_hash("Dentry cache",
+ 					sizeof(struct hlist_bl_head),
+ 					dhash_entries,
+ 					13,
+ 					HASH_ZERO,	/* no HASH_EARLY here, unlike dcache_init_early() */
+ 					&d_hash_shift,
+ 					NULL,
+ 					0,
+ 					0);
+ 	d_hash_shift = 32 - d_hash_shift;	/* same conversion as in dcache_init_early() */
+ }
+ 
+ /* SLAB cache for __getname() consumers; created in vfs_caches_init() */
+ struct kmem_cache *names_cachep __read_mostly;
+ EXPORT_SYMBOL(names_cachep);
+ 
+ void __init vfs_caches_init_early(void)
+ {
+ 	int i;
+ 	/* Initialise every in_lookup_hashtable bucket head, then run the early dcache/inode setup. */
+ 	for (i = 0; i < ARRAY_SIZE(in_lookup_hashtable); i++)
+ 		INIT_HLIST_BL_HEAD(&in_lookup_hashtable[i]);
+ 
+ 	dcache_init_early();
+ 	inode_init_early();
+ }
+ 
+ void __init vfs_caches_init(void)
+ {
+ 	names_cachep = kmem_cache_create_usercopy("names_cache", PATH_MAX, 0,	/* PATH_MAX-sized objects */
+ 			SLAB_HWCACHE_ALIGN|SLAB_PANIC, 0, PATH_MAX, NULL);
+ 	/* The remaining subsystem initialisers run in this fixed order. */
+ 	dcache_init();
+ 	inode_init();
+ 	files_init();
+ 	files_maxfiles_init();
+ 	mnt_init();
+ 	bdev_cache_init();
+ 	chrdev_init();
+ }
diff --color -rcNP Master/fs/dcache.c.rej OG/fs/dcache.c.rej
*** Master/fs/dcache.c.rej	1969-12-31 19:00:00.000000000 -0500
--- OG/fs/dcache.c.rej	2021-04-20 15:11:27.314000000 -0400
***************
*** 0 ****
--- 1,203 ----
+ *** fs/dcache.c	2021-03-13 14:02:45.000000000 +0200
+ --- fs/dcache.c	2021-03-11 15:06:51.000000000 +0200
+ ***************
+ *** 268,274 ****
+   {
+   	struct dentry *dentry = container_of(head, struct dentry, d_u.d_rcu);
+   
+ ! 	kmem_cache_free(dentry_cache, dentry);
+   }
+   
+   static void __d_free_external(struct rcu_head *head)
+ --- 268,274 ----
+   {
+   	struct dentry *dentry = container_of(head, struct dentry, d_u.d_rcu);
+   
+ ! 	kmem_cache_free(dentry_cache, dentry);
+   }
+   
+   static void __d_free_external(struct rcu_head *head)
+ ***************
+ *** 811,817 ****
+   }
+   
+   
+ ! /*
+    * This is dput
+    *
+    * This is complicated by the fact that we do not want to put
+ --- 811,817 ----
+   }
+   
+   
+ ! /*
+    * This is dput
+    *
+    * This is complicated by the fact that we do not want to put
+ ***************
+ *** 830,836 ****
+   
+   /*
+    * dput - release a dentry
+ !  * @dentry: dentry to release
+    *
+    * Release a dentry. This will drop the usage count and if appropriate
+    * call the dentry unlink method as well as removing it from the queues and
+ --- 830,836 ----
+   
+   /*
+    * dput - release a dentry
+ !  * @dentry: dentry to release
+    *
+    * Release a dentry. This will drop the usage count and if appropriate
+    * call the dentry unlink method as well as removing it from the queues and
+ ***************
+ *** 1680,1686 ****
+    * available. On a success the dentry is returned. The name passed in is
+    * copied and the copy passed in may be reused after this call.
+    */
+ ! 
+   struct dentry *__d_alloc(struct super_block *sb, const struct qstr *name)
+   {
+   	struct dentry *dentry;
+ --- 1680,1686 ----
+    * available. On a success the dentry is returned. The name passed in is
+    * copied and the copy passed in may be reused after this call.
+    */
+ ! 
+   struct dentry *__d_alloc(struct super_block *sb, const struct qstr *name)
+   {
+   	struct dentry *dentry;
+ ***************
+ *** 1707,1720 ****
+   						  GFP_KERNEL_ACCOUNT |
+   						  __GFP_RECLAIMABLE);
+   		if (!p) {
+ ! 			kmem_cache_free(dentry_cache, dentry);
+   			return NULL;
+   		}
+   		atomic_set(&p->u.count, 1);
+   		dname = p->name;
+   	} else  {
+   		dname = dentry->d_iname;
+ ! 	}
+   
+   	dentry->d_name.len = name->len;
+   	dentry->d_name.hash = name->hash;
+ --- 1707,1720 ----
+   						  GFP_KERNEL_ACCOUNT |
+   						  __GFP_RECLAIMABLE);
+   		if (!p) {
+ ! 			kmem_cache_free(dentry_cache, dentry);
+   			return NULL;
+   		}
+   		atomic_set(&p->u.count, 1);
+   		dname = p->name;
+   	} else  {
+   		dname = dentry->d_iname;
+ ! 	}
+   
+   	dentry->d_name.len = name->len;
+   	dentry->d_name.hash = name->hash;
+ ***************
+ *** 1946,1952 ****
+    * (or otherwise set) by the caller to indicate that it is now
+    * in use by the dcache.
+    */
+ ! 
+   void d_instantiate(struct dentry *entry, struct inode * inode)
+   {
+   	BUG_ON(!hlist_unhashed(&entry->d_u.d_alias));
+ --- 1943,1949 ----
+    * (or otherwise set) by the caller to indicate that it is now
+    * in use by the dcache.
+    */
+ ! 
+   void d_instantiate(struct dentry *entry, struct inode * inode)
+   {
+   	BUG_ON(!hlist_unhashed(&entry->d_u.d_alias));
+ ***************
+ *** 2156,2162 ****
+   		if (!found) {
+   			iput(inode);
+   			return ERR_PTR(-ENOMEM);
+ ! 		}
+   	}
+   	res = d_splice_alias(inode, found);
+   	if (res) {
+ --- 2153,2159 ----
+   		if (!found) {
+   			iput(inode);
+   			return ERR_PTR(-ENOMEM);
+ ! 		}
+   	}
+   	res = d_splice_alias(inode, found);
+   	if (res) {
+ ***************
+ *** 2365,2371 ****
+   	 * See Documentation/filesystems/path-lookup.txt for more details.
+   	 */
+   	rcu_read_lock();
+ ! 
+   	hlist_bl_for_each_entry_rcu(dentry, node, b, d_hash) {
+   
+   		if (dentry->d_name.hash != hash)
+ --- 2362,2368 ----
+   	 * See Documentation/filesystems/path-lookup.txt for more details.
+   	 */
+   	rcu_read_lock();
+ ! 
+   	hlist_bl_for_each_entry_rcu(dentry, node, b, d_hash) {
+   
+   		if (dentry->d_name.hash != hash)
+ ***************
+ *** 2436,2442 ****
+    * Turn the dentry into a negative dentry if possible, otherwise
+    * remove it from the hash queues so it can be deleted later
+    */
+ ! 
+   void d_delete(struct dentry * dentry)
+   {
+   	struct inode *inode = dentry->d_inode;
+ --- 2433,2439 ----
+    * Turn the dentry into a negative dentry if possible, otherwise
+    * remove it from the hash queues so it can be deleted later
+    */
+ ! 
+   void d_delete(struct dentry * dentry)
+   {
+   	struct inode *inode = dentry->d_inode;
+ ***************
+ *** 2472,2478 ****
+    *
+    * Adds a dentry to the hash according to its name.
+    */
+ ! 
+   void d_rehash(struct dentry * entry)
+   {
+   	spin_lock(&entry->d_lock);
+ --- 2469,2475 ----
+    *
+    * Adds a dentry to the hash according to its name.
+    */
+ ! 
+   void d_rehash(struct dentry * entry)
+   {
+   	spin_lock(&entry->d_lock);
+ ***************
+ *** 3063,3069 ****
+    * Returns false otherwise.
+    * Caller must ensure that "new_dentry" is pinned before calling is_subdir()
+    */
+ ! 
+   bool is_subdir(struct dentry *new_dentry, struct dentry *old_dentry)
+   {
+   	bool result;
+ --- 3060,3066 ----
+    * Returns false otherwise.
+    * Caller must ensure that "new_dentry" is pinned before calling is_subdir()
+    */
+ ! 
+   bool is_subdir(struct dentry *new_dentry, struct dentry *old_dentry)
+   {
+   	bool result;
diff --color -rcNP Master/fs/debugfs/inode.c OG/fs/debugfs/inode.c
*** Master/fs/debugfs/inode.c	2021-04-20 14:17:31.000000000 -0400
--- OG/fs/debugfs/inode.c	2021-04-20 15:11:34.506000000 -0400
***************
*** 547,552 ****
--- 547,557 ----
   * If debugfs is not enabled in the kernel, the value -%ENODEV will be
   * returned.
   */
+ 
+ #ifdef CONFIG_MINISEC_SYSFS_RESTRICT
+ extern int grsec_enable_sysfs_restrict;
+ #endif
+ 
  struct dentry *debugfs_create_dir(const char *name, struct dentry *parent)
  {
  	struct dentry *dentry = start_creating(name, parent);
***************
*** 561,568 ****
  		       name);
  		return failed_creating(dentry);
  	}
! 
! 	inode->i_mode = S_IFDIR | S_IRWXU | S_IRUGO | S_IXUGO;
  	inode->i_op = &debugfs_dir_inode_operations;
  	inode->i_fop = &simple_dir_operations;
  
--- 566,578 ----
  		       name);
  		return failed_creating(dentry);
  	}
! 	
! #ifdef CONFIG_MINISEC_SYSFS_RESTRICT
! 	if (grsec_enable_sysfs_restrict)
! 		inode->i_mode = S_IFDIR | S_IRWXU;
! 	else
! #endif
! 		inode->i_mode = S_IFDIR | S_IRWXU | S_IRUGO | S_IXUGO;
  	inode->i_op = &debugfs_dir_inode_operations;
  	inode->i_fop = &simple_dir_operations;
  
***************
*** 913,916 ****
  	return retval;
  }
  core_initcall(debugfs_init);
- 
--- 923,925 ----
diff --color -rcNP Master/fs/exec.c OG/fs/exec.c
*** Master/fs/exec.c	2021-04-20 14:17:31.000000000 -0400
--- OG/fs/exec.c	2021-04-20 15:11:34.507000000 -0400
***************
*** 65,72 ****
--- 65,81 ----
  #include <linux/compat.h>
  #include <linux/vmalloc.h>
  #include <linux/random.h>
+ #include <linux/seq_file.h>
+ #include <linux/coredump.h>
+ #include <linux/mman.h>
+ 
+ #ifdef CONFIG_MINISEC
+ #include <linux/minisec.h>
+ #endif
+ #include <linux/random.h>
  
  #include <linux/uaccess.h>
+ #include <asm/sections.h>
  #include <asm/mmu_context.h>
  #include <asm/tlb.h>
  
***************
*** 278,283 ****
--- 287,298 ----
  	arch_bprm_mm_init(mm, vma);
  	up_write(&mm->mmap_sem);
  	bprm->p = vma->vm_end - sizeof(void *);
+ 
+ #ifdef CONFIG_MINISEC_RANDUSTACK
+ 	if (randomize_va_space)
+ 		bprm->p ^= prandom_u32() & ~PAGE_MASK;
+ #endif
+ 
  	if (!(current->personality & ADDR_NO_RANDOMIZE) && randomize_va_space)
  		bprm->p ^= get_random_int() & ~PAGE_MASK;
  	return 0;
***************
*** 748,753 ****
--- 763,780 ----
  
  	vm_flags = VM_STACK_FLAGS;
  
+ #if defined(CONFIG_MINISEC_PAGEEXEC)
+ 	if (mm->pax_flags & MF_PAX_PAGEEXEC) {
+ 		vm_flags &= ~VM_EXEC;
+ 
+ #ifdef CONFIG_MINISEC_MPROTECT
+ 		if (mm->pax_flags & MF_PAX_MPROTECT)
+ 			vm_flags &= ~VM_MAYEXEC;
+ #endif
+ 
+ 	}
+ #endif
+ 
  	/*
  	 * Adjust stack execute permissions; explicitly enable for
  	 * EXSTACK_ENABLE_X, disable for EXSTACK_DISABLE_X and leave alone
***************
*** 756,762 ****
--- 783,794 ----
  	if (unlikely(executable_stack == EXSTACK_ENABLE_X))
  		vm_flags |= VM_EXEC;
  	else if (executable_stack == EXSTACK_DISABLE_X)
+ 	{
  		vm_flags &= ~VM_EXEC;
+ #ifdef CONFIG_MINISEC_MPROTECT
+ 		vm_flags &= ~VM_MAYEXEC;
+ #endif
+ 	}
  	vm_flags |= mm->def_flags;
  	vm_flags |= VM_STACK_INCOMPLETE_SETUP;
  
***************
*** 2028,2030 ****
--- 2060,2193 ----
  				  argv, envp, flags);
  }
  #endif
+ 
+ int pax_check_flags(unsigned long *flags)
+ {
+ 	int retval = 0;
+ 	/* Each check below clears the offending MF_PAX_* bit in *flags and sets retval to -EINVAL; 0 means nothing was cleared. */
+ #if !defined(CONFIG_X86_32) || !defined(CONFIG_MINISEC_SEGMEXEC)
+ 	if (*flags & MF_PAX_SEGMEXEC)
+ 	{
+ 		*flags &= ~MF_PAX_SEGMEXEC;
+ 		retval = -EINVAL;
+ 	}
+ #endif
+ 	/* PAGEEXEC: always rejected without CONFIG_MINISEC_PAGEEXEC, else only when SEGMEXEC is set too. */
+ 	if ((*flags & MF_PAX_PAGEEXEC)
+ 
+ #ifdef CONFIG_MINISEC_PAGEEXEC
+ 	    &&  (*flags & MF_PAX_SEGMEXEC)
+ #endif
+ 
+ 	   )
+ 	{
+ 		*flags &= ~MF_PAX_PAGEEXEC;
+ 		retval = -EINVAL;
+ 	}
+ 	/* MPROTECT: always rejected without CONFIG_MINISEC_MPROTECT, else when neither PAGEEXEC nor SEGMEXEC is set. */
+ 	if ((*flags & MF_PAX_MPROTECT)
+ 
+ #ifdef CONFIG_MINISEC_MPROTECT
+ 	    && !(*flags & (MF_PAX_PAGEEXEC | MF_PAX_SEGMEXEC))
+ #endif
+ 
+ 	   )
+ 	{
+ 		*flags &= ~MF_PAX_MPROTECT;
+ 		retval = -EINVAL;
+ 	}
+ 	/* EMUTRAMP: always rejected without CONFIG_MINISEC_EMUTRAMP, else when neither PAGEEXEC nor SEGMEXEC is set. */
+ 	if ((*flags & MF_PAX_EMUTRAMP)
+ 
+ #ifdef CONFIG_MINISEC_EMUTRAMP
+ 	    && !(*flags & (MF_PAX_PAGEEXEC | MF_PAX_SEGMEXEC))
+ #endif
+ 
+ 	   )
+ 	{
+ 		*flags &= ~MF_PAX_EMUTRAMP;
+ 		retval = -EINVAL;
+ 	}
+ 
+ 	return retval;
+ }
+ 
+ EXPORT_SYMBOL(pax_check_flags);
+ 
+ #if defined(CONFIG_MINISEC_PAGEEXEC)
+ char *pax_get_path(const struct path *path, char *buf, int buflen)
+ {
+ 	char *pathname = d_path(path, buf, buflen);
+ 	/* Returns @buf with the mangled path, or a string literal on failure: callers must not modify or free the result. */
+ 	if (IS_ERR(pathname))
+ 		goto toolong;
+ 
+ 	pathname = mangle_path(buf, pathname, "\t\n\\");	/* presumably escapes these characters and returns the end of the output - confirm */
+ 	if (!pathname)
+ 		goto toolong;
+ 
+ 	*pathname = 0;	/* NUL-terminate at the position mangle_path() returned */
+ 	return buf;
+ 
+ toolong:
+ 	return "<path too long>";
+ }
+ EXPORT_SYMBOL(pax_get_path);
+ 
+ void pax_report_fault(struct pt_regs *regs, void *pc, void *sp)
+ {
+ 	struct task_struct *tsk = current;
+ 	struct mm_struct *mm = current->mm;
+ 	char *buffer_exec = (char *)__get_free_page(GFP_KERNEL);	/* scratch page for the executable's path */
+ 	char *buffer_fault = (char *)__get_free_page(GFP_KERNEL);	/* scratch page for the faulting mapping's path */
+ 	char *path_exec = NULL;
+ 	char *path_fault = NULL;
+ 	unsigned long start = 0UL, end = 0UL, offset = 0UL;
+ 	kernel_siginfo_t info = { };
+ 
+ 	if (buffer_exec && buffer_fault) {	/* lookup is skipped entirely if either allocation failed */
+ 		struct vm_area_struct *vma, *vma_exec = NULL, *vma_fault = NULL;
+ 
+ 		/* NOTE(review): down_read(&mm->mmap_sem) is disabled here, so this VMA walk relies on the caller for list stability - confirm. */
+ 		vma = mm->mmap;
+ 		while (vma && (!vma_exec || !vma_fault)) {	/* stop once both VMAs are found or the list ends */
+ 			if (vma->vm_file && mm->exe_file == vma->vm_file && (vma->vm_flags & VM_EXEC))
+ 				vma_exec = vma;
+ 			if (vma->vm_start <= (unsigned long)pc && (unsigned long)pc < vma->vm_end)
+ 				vma_fault = vma;
+ 			vma = vma->vm_next;
+ 		}
+ 		if (vma_exec)
+ 			path_exec = pax_get_path(&vma_exec->vm_file->f_path, buffer_exec, PAGE_SIZE);
+ 		if (vma_fault) {
+ 			start = vma_fault->vm_start;
+ 			end = vma_fault->vm_end;
+ 			offset = vma_fault->vm_pgoff << PAGE_SHIFT;
+ 			if (vma_fault->vm_file)
+ 				path_fault = pax_get_path(&vma_fault->vm_file->f_path, buffer_fault, PAGE_SIZE);
+ 			else if ((unsigned long)pc >= mm->start_brk && (unsigned long)pc < mm->brk)
+ 				path_fault = "<heap>";
+ 			else if (vma_fault->vm_flags & (VM_GROWSDOWN | VM_GROWSUP))
+ 				path_fault = "<stack>";
+ 			else
+ 				path_fault = "<anonymous mapping>";
+ 		}
+ 		/* NOTE(review): the matching up_read(&mm->mmap_sem) is disabled as well. */
+ 	}
+ 	if (tsk->signal->curr_ip)	/* path_fault/path_exec may still be NULL when printed below */
+ 		printk(KERN_ERR "PAX: From %pI4: execution attempt in: %s, %08lx-%08lx %08lx\n", &tsk->signal->curr_ip, path_fault, start, end, offset);
+ 	else
+ 		printk(KERN_ERR "PAX: execution attempt in: %s, %08lx-%08lx %08lx\n", path_fault, start, end, offset);
+ 	printk(KERN_ERR "PAX: terminating task: %s(%s):%d, uid/euid: %u/%u, PC: %p, SP: %p\n", path_exec, tsk->comm, task_pid_nr(tsk),
+ 			from_kuid_munged(&init_user_ns, task_uid(tsk)), from_kuid_munged(&init_user_ns, task_euid(tsk)), pc, sp);	/* NOTE(review): %p may print hashed values on this kernel - confirm PC/SP are usable */
+ 	free_page((unsigned long)buffer_exec);
+ 	free_page((unsigned long)buffer_fault);
+ 	pax_report_insns(regs, pc, sp);
+ 	info.si_signo = SIGKILL;
+ 	info.si_errno = 0;
+ 	info.si_code = SI_KERNEL;
+ 	info.si_pid = 0;
+ 	info.si_uid = 0;
+ 	do_coredump(&info);	/* NOTE(review): no explicit kill follows; presumably do_coredump() ends the task - confirm */
+ }
+ #endif
diff --color -rcNP Master/fs/fcntl.c OG/fs/fcntl.c
*** Master/fs/fcntl.c	2021-04-20 14:17:31.000000000 -0400
--- OG/fs/fcntl.c	2021-04-20 15:11:34.507000000 -0400
***************
*** 25,30 ****
--- 25,31 ----
  #include <linux/user_namespace.h>
  #include <linux/memfd.h>
  #include <linux/compat.h>
+ #include <linux/minisec.h>
  
  #include <linux/poll.h>
  #include <asm/siginfo.h>
***************
*** 106,111 ****
--- 107,114 ----
  		int force)
  {
  	security_file_set_fowner(filp);
+ 	if (gr_handle_chroot_fowner(pid, type))
+ 		return;
  	f_modown(filp, pid, type, force);
  }
  EXPORT_SYMBOL(__f_setown);
***************
*** 446,452 ****
  }
  
  SYSCALL_DEFINE3(fcntl, unsigned int, fd, unsigned int, cmd, unsigned long, arg)
! {	
  	struct fd f = fdget_raw(fd);
  	long err = -EBADF;
  
--- 449,455 ----
  }
  
  SYSCALL_DEFINE3(fcntl, unsigned int, fd, unsigned int, cmd, unsigned long, arg)
! {
  	struct fd f = fdget_raw(fd);
  	long err = -EBADF;
  
***************
*** 471,477 ****
  #if BITS_PER_LONG == 32
  SYSCALL_DEFINE3(fcntl64, unsigned int, fd, unsigned int, cmd,
  		unsigned long, arg)
! {	
  	void __user *argp = (void __user *)arg;
  	struct fd f = fdget_raw(fd);
  	struct flock64 flock;
--- 474,480 ----
  #if BITS_PER_LONG == 32
  SYSCALL_DEFINE3(fcntl64, unsigned int, fd, unsigned int, cmd,
  		unsigned long, arg)
! {
  	void __user *argp = (void __user *)arg;
  	struct fd f = fdget_raw(fd);
  	struct flock64 flock;
***************
*** 488,494 ****
  	err = security_file_fcntl(f.file, cmd, arg);
  	if (err)
  		goto out1;
! 	
  	switch (cmd) {
  	case F_GETLK64:
  	case F_OFD_GETLK:
--- 491,497 ----
  	err = security_file_fcntl(f.file, cmd, arg);
  	if (err)
  		goto out1;
! 
  	switch (cmd) {
  	case F_GETLK64:
  	case F_OFD_GETLK:
***************
*** 738,745 ****
  		kernel_siginfo_t si;
  		default:
  			/* Queue a rt signal with the appropriate fd as its
! 			   value.  We use SI_SIGIO as the source, not 
! 			   SI_KERNEL, since kernel signals always get 
  			   delivered even if we can't queue.  Failure to
  			   queue in this case _should_ be reported; we fall
  			   back to SIGIO in that case. --sct */
--- 741,748 ----
  		kernel_siginfo_t si;
  		default:
  			/* Queue a rt signal with the appropriate fd as its
! 			   value.  We use SI_SIGIO as the source, not
! 			   SI_KERNEL, since kernel signals always get
  			   delivered even if we can't queue.  Failure to
  			   queue in this case _should_ be reported; we fall
  			   back to SIGIO in that case. --sct */
***************
*** 781,787 ****
  	enum pid_type type;
  	unsigned long flags;
  	struct pid *pid;
! 	
  	read_lock_irqsave(&fown->lock, flags);
  
  	type = fown->pid_type;
--- 784,790 ----
  	enum pid_type type;
  	unsigned long flags;
  	struct pid *pid;
! 
  	read_lock_irqsave(&fown->lock, flags);
  
  	type = fown->pid_type;
***************
*** 820,826 ****
  	struct pid *pid;
  	unsigned long flags;
  	int ret = 0;
! 	
  	read_lock_irqsave(&fown->lock, flags);
  
  	type = fown->pid_type;
--- 823,829 ----
  	struct pid *pid;
  	unsigned long flags;
  	int ret = 0;
! 
  	read_lock_irqsave(&fown->lock, flags);
  
  	type = fown->pid_type;
diff --color -rcNP Master/fs/fcntl.c.orig OG/fs/fcntl.c.orig
*** Master/fs/fcntl.c.orig	1969-12-31 19:00:00.000000000 -0500
--- OG/fs/fcntl.c.orig	2021-04-20 15:10:45.380000000 -0400
***************
*** 0 ****
--- 1,1049 ----
+ // SPDX-License-Identifier: GPL-2.0
+ /*
+  *  linux/fs/fcntl.c
+  *
+  *  Copyright (C) 1991, 1992  Linus Torvalds
+  */
+ 
+ #include <linux/syscalls.h>
+ #include <linux/init.h>
+ #include <linux/mm.h>
+ #include <linux/sched/task.h>
+ #include <linux/fs.h>
+ #include <linux/file.h>
+ #include <linux/fdtable.h>
+ #include <linux/capability.h>
+ #include <linux/dnotify.h>
+ #include <linux/slab.h>
+ #include <linux/module.h>
+ #include <linux/pipe_fs_i.h>
+ #include <linux/security.h>
+ #include <linux/ptrace.h>
+ #include <linux/signal.h>
+ #include <linux/rcupdate.h>
+ #include <linux/pid_namespace.h>
+ #include <linux/user_namespace.h>
+ #include <linux/memfd.h>
+ #include <linux/compat.h>
+ #include <linux/minisec.h>
+ 
+ #include <linux/poll.h>
+ #include <asm/siginfo.h>
+ #include <linux/uaccess.h>
+ 
+ #define SETFL_MASK (O_APPEND | O_NONBLOCK | O_NDELAY | O_DIRECT | O_NOATIME)	/* the only f_flags bits setfl() replaces */
+ 
+ static int setfl(int fd, struct file * filp, unsigned long arg)
+ {
+ 	struct inode * inode = file_inode(filp);
+ 	int error = 0;
+ 
+ 	/*
+ 	 * O_APPEND cannot be cleared if the file is marked as append-only
+ 	 * and the file is open for write.
+ 	 */
+ 	if (((arg ^ filp->f_flags) & O_APPEND) && IS_APPEND(inode))
+ 		return -EPERM;
+ 
+ 	/* O_NOATIME can only be set by the owner or superuser */
+ 	if ((arg & O_NOATIME) && !(filp->f_flags & O_NOATIME))
+ 		if (!inode_owner_or_capable(inode))
+ 			return -EPERM;
+ 
+ 	/* required for strict SunOS emulation */
+ 	if (O_NONBLOCK != O_NDELAY)
+ 	       if (arg & O_NDELAY)
+ 		   arg |= O_NONBLOCK;
+ 
+ 	/* Pipe packetized mode is controlled by O_DIRECT flag */
+ 	if (!S_ISFIFO(inode->i_mode) && (arg & O_DIRECT)) {
+ 		if (!filp->f_mapping || !filp->f_mapping->a_ops ||
+ 			!filp->f_mapping->a_ops->direct_IO)
+ 				return -EINVAL;
+ 	}
+ 
+ 	if (filp->f_op->check_flags)
+ 		error = filp->f_op->check_flags(arg);
+ 	if (error)
+ 		return error;
+ 
+ 	/*
+ 	 * ->fasync() is responsible for setting the FASYNC bit.
+ 	 */
+ 	if (((arg ^ filp->f_flags) & FASYNC) && filp->f_op->fasync) {
+ 		error = filp->f_op->fasync(fd, filp, (arg & FASYNC) != 0);
+ 		if (error < 0)
+ 			goto out;
+ 		if (error > 0)
+ 			error = 0;	/* positive ->fasync() results are reported as success */
+ 	}
+ 	spin_lock(&filp->f_lock);
+ 	filp->f_flags = (arg & SETFL_MASK) | (filp->f_flags & ~SETFL_MASK);	/* bits outside SETFL_MASK keep their old value */
+ 	spin_unlock(&filp->f_lock);
+ 
+  out:
+ 	return error;
+ }
+ 
+ static void f_modown(struct file *filp, struct pid *pid, enum pid_type type,
+                      int force)
+ {
+ 	write_lock_irq(&filp->f_owner.lock);
+ 	if (force || !filp->f_owner.pid) {	/* without @force an existing owner is left alone */
+ 		put_pid(filp->f_owner.pid);
+ 		filp->f_owner.pid = get_pid(pid);
+ 		filp->f_owner.pid_type = type;
+ 		/* uid/euid are recorded from the current task only when a new owner pid is set. */
+ 		if (pid) {
+ 			const struct cred *cred = current_cred();
+ 			filp->f_owner.uid = cred->uid;
+ 			filp->f_owner.euid = cred->euid;
+ 		}
+ 	}
+ 	write_unlock_irq(&filp->f_owner.lock);
+ }
+ 
+ void __f_setown(struct file *filp, struct pid *pid, enum pid_type type,
+ 		int force)
+ {
+ 	security_file_set_fowner(filp);
+ 	if (gr_handle_chroot_fowner(pid, type))	/* NOTE(review): refusal is silent; this function is void, so f_setown() still returns 0 */
+ 		return;
+ 	f_modown(filp, pid, type, force);
+ }
+ EXPORT_SYMBOL(__f_setown);
+ 
+ int f_setown(struct file *filp, unsigned long arg, int force)
+ {
+ 	enum pid_type type;
+ 	struct pid *pid = NULL;
+ 	int who = arg, ret = 0;
+ 	/* @arg is read as an int: negative selects a process group, 0 leaves pid NULL. */
+ 	type = PIDTYPE_TGID;
+ 	if (who < 0) {
+ 		/* avoid overflow below */
+ 		if (who == INT_MIN)
+ 			return -EINVAL;
+ 
+ 		type = PIDTYPE_PGID;
+ 		who = -who;
+ 	}
+ 
+ 	rcu_read_lock();
+ 	if (who) {
+ 		pid = find_vpid(who);
+ 		if (!pid)
+ 			ret = -ESRCH;
+ 	}
+ 
+ 	if (!ret)
+ 		__f_setown(filp, pid, type, force);
+ 	rcu_read_unlock();
+ 
+ 	return ret;	/* 0, or -ESRCH when find_vpid() found nothing; -EINVAL is returned earlier */
+ }
+ EXPORT_SYMBOL(f_setown);
+ 
+ void f_delown(struct file *filp)
+ {
+ 	f_modown(filp, NULL, PIDTYPE_TGID, 1);	/* forced update with a NULL pid, i.e. no owner */
+ }
+ 
+ pid_t f_getown(struct file *filp)
+ {
+ 	pid_t pid;
+ 	read_lock(&filp->f_owner.lock);
+ 	pid = pid_vnr(filp->f_owner.pid);
+ 	if (filp->f_owner.pid_type == PIDTYPE_PGID)
+ 		pid = -pid;	/* process-group owners are reported as negative values */
+ 	read_unlock(&filp->f_owner.lock);
+ 	return pid;
+ }
+ 
+ static int f_setown_ex(struct file *filp, unsigned long arg)
+ {
+ 	struct f_owner_ex __user *owner_p = (void __user *)arg;
+ 	struct f_owner_ex owner;
+ 	struct pid *pid;
+ 	int type;
+ 	int ret;
+ 	/* @arg is a user pointer to struct f_owner_ex; ret is 0 after a successful copy and is the success return. */
+ 	ret = copy_from_user(&owner, owner_p, sizeof(owner));
+ 	if (ret)
+ 		return -EFAULT;
+ 
+ 	switch (owner.type) {	/* map the F_OWNER_* request to a pid_type */
+ 	case F_OWNER_TID:
+ 		type = PIDTYPE_PID;
+ 		break;
+ 
+ 	case F_OWNER_PID:
+ 		type = PIDTYPE_TGID;
+ 		break;
+ 
+ 	case F_OWNER_PGRP:
+ 		type = PIDTYPE_PGID;
+ 		break;
+ 
+ 	default:
+ 		return -EINVAL;
+ 	}
+ 
+ 	rcu_read_lock();
+ 	pid = find_vpid(owner.pid);
+ 	if (owner.pid && !pid)	/* owner.pid == 0 with no pid found is accepted and passed on as NULL */
+ 		ret = -ESRCH;
+ 	else
+ 		 __f_setown(filp, pid, type, 1);
+ 	rcu_read_unlock();
+ 
+ 	return ret;
+ }
+ 
+ static int f_getown_ex(struct file *filp, unsigned long arg)
+ {
+ 	struct f_owner_ex __user *owner_p = (void __user *)arg;
+ 	struct f_owner_ex owner;
+ 	int ret = 0;
+ 	/* Snapshot the owner under f_owner.lock; the copy to user space happens after unlocking. */
+ 	read_lock(&filp->f_owner.lock);
+ 	owner.pid = pid_vnr(filp->f_owner.pid);
+ 	switch (filp->f_owner.pid_type) {	/* reverse of the mapping in f_setown_ex() */
+ 	case PIDTYPE_PID:
+ 		owner.type = F_OWNER_TID;
+ 		break;
+ 
+ 	case PIDTYPE_TGID:
+ 		owner.type = F_OWNER_PID;
+ 		break;
+ 
+ 	case PIDTYPE_PGID:
+ 		owner.type = F_OWNER_PGRP;
+ 		break;
+ 
+ 	default:
+ 		WARN_ON(1);	/* any other pid_type is treated as unexpected */
+ 		ret = -EINVAL;
+ 		break;
+ 	}
+ 	read_unlock(&filp->f_owner.lock);
+ 
+ 	if (!ret) {
+ 		ret = copy_to_user(owner_p, &owner, sizeof(owner));
+ 		if (ret)
+ 			ret = -EFAULT;
+ 	}
+ 	return ret;
+ }
+ 
+ #ifdef CONFIG_CHECKPOINT_RESTORE
+ static int f_getowner_uids(struct file *filp, unsigned long arg)
+ {
+ 	struct user_namespace *user_ns = current_user_ns();
+ 	uid_t __user *dst = (void __user *)arg;	/* user array of two uid_t: [0] = uid, [1] = euid */
+ 	uid_t src[2];
+ 	int err;
+ 	/* Values are translated into the caller's user namespace while f_owner.lock is held. */
+ 	read_lock(&filp->f_owner.lock);
+ 	src[0] = from_kuid(user_ns, filp->f_owner.uid);
+ 	src[1] = from_kuid(user_ns, filp->f_owner.euid);
+ 	read_unlock(&filp->f_owner.lock);
+ 
+ 	err  = put_user(src[0], &dst[0]);
+ 	err |= put_user(src[1], &dst[1]);	/* both stores are attempted; the results are OR-ed together */
+ 
+ 	return err;
+ }
+ #else
+ static int f_getowner_uids(struct file *filp, unsigned long arg)
+ {
+ 	return -EINVAL;	/* stub used when CONFIG_CHECKPOINT_RESTORE is not set */
+ }
+ #endif
+ 
+ static bool rw_hint_valid(enum rw_hint hint)
+ {
+ 	switch (hint) {	/* true only for the six constants listed below */
+ 	case RWF_WRITE_LIFE_NOT_SET:
+ 	case RWH_WRITE_LIFE_NONE:
+ 	case RWH_WRITE_LIFE_SHORT:
+ 	case RWH_WRITE_LIFE_MEDIUM:
+ 	case RWH_WRITE_LIFE_LONG:
+ 	case RWH_WRITE_LIFE_EXTREME:
+ 		return true;
+ 	default:
+ 		return false;
+ 	}
+ }
+ 
+ static long fcntl_rw_hint(struct file *file, unsigned int cmd,
+ 			  unsigned long arg)
+ {
+ 	struct inode *inode = file_inode(file);
+ 	u64 *argp = (u64 __user *)arg;	/* NOTE(review): declaration drops the __user annotation that the cast carries */
+ 	enum rw_hint hint;
+ 	u64 h;
+ 	/* The hint travels through a user-space u64; the FILE variants act on the file, the others on its inode. */
+ 	switch (cmd) {
+ 	case F_GET_FILE_RW_HINT:
+ 		h = file_write_hint(file);
+ 		if (copy_to_user(argp, &h, sizeof(*argp)))
+ 			return -EFAULT;
+ 		return 0;
+ 	case F_SET_FILE_RW_HINT:
+ 		if (copy_from_user(&h, argp, sizeof(h)))
+ 			return -EFAULT;
+ 		hint = (enum rw_hint) h;
+ 		if (!rw_hint_valid(hint))
+ 			return -EINVAL;
+ 
+ 		spin_lock(&file->f_lock);	/* per-file hint is updated under f_lock */
+ 		file->f_write_hint = hint;
+ 		spin_unlock(&file->f_lock);
+ 		return 0;
+ 	case F_GET_RW_HINT:
+ 		h = inode->i_write_hint;
+ 		if (copy_to_user(argp, &h, sizeof(*argp)))
+ 			return -EFAULT;
+ 		return 0;
+ 	case F_SET_RW_HINT:
+ 		if (copy_from_user(&h, argp, sizeof(h)))
+ 			return -EFAULT;
+ 		hint = (enum rw_hint) h;
+ 		if (!rw_hint_valid(hint))
+ 			return -EINVAL;
+ 
+ 		inode_lock(inode);	/* per-inode hint is updated under the inode lock */
+ 		inode->i_write_hint = hint;
+ 		inode_unlock(inode);
+ 		return 0;
+ 	default:
+ 		return -EINVAL;
+ 	}
+ }
+ 
+ static long do_fcntl(int fd, unsigned int cmd, unsigned long arg,
+ 		struct file *filp)
+ {
+ 	void __user *argp = (void __user *)arg;
+ 	struct flock flock;
+ 	long err = -EINVAL;	/* result for any command not handled below */
+ 
+ 	switch (cmd) {
+ 	case F_DUPFD:
+ 		err = f_dupfd(arg, filp, 0);
+ 		break;
+ 	case F_DUPFD_CLOEXEC:
+ 		err = f_dupfd(arg, filp, O_CLOEXEC);
+ 		break;
+ 	case F_GETFD:
+ 		err = get_close_on_exec(fd) ? FD_CLOEXEC : 0;
+ 		break;
+ 	case F_SETFD:
+ 		err = 0;
+ 		set_close_on_exec(fd, arg & FD_CLOEXEC);
+ 		break;
+ 	case F_GETFL:
+ 		err = filp->f_flags;
+ 		break;
+ 	case F_SETFL:
+ 		err = setfl(fd, filp, arg);
+ 		break;
+ #if BITS_PER_LONG != 32
+ 	/* 32-bit arches must use fcntl64() */
+ 	case F_OFD_GETLK:
+ #endif
+ 	case F_GETLK:
+ 		if (copy_from_user(&flock, argp, sizeof(flock)))
+ 			return -EFAULT;
+ 		err = fcntl_getlk(filp, cmd, &flock);
+ 		if (!err && copy_to_user(argp, &flock, sizeof(flock)))	/* result is copied back only on success */
+ 			return -EFAULT;
+ 		break;
+ #if BITS_PER_LONG != 32
+ 	/* 32-bit arches must use fcntl64() */
+ 	case F_OFD_SETLK:
+ 	case F_OFD_SETLKW:
+ #endif
+ 		/* Fallthrough */
+ 	case F_SETLK:
+ 	case F_SETLKW:
+ 		if (copy_from_user(&flock, argp, sizeof(flock)))
+ 			return -EFAULT;
+ 		err = fcntl_setlk(fd, filp, cmd, &flock);
+ 		break;
+ 	case F_GETOWN:
+ 		/*
+ 		 * XXX If f_owner is a process group, the
+ 		 * negative return value will get converted
+ 		 * into an error.  Oops.  If we keep the
+ 		 * current syscall conventions, the only way
+ 		 * to fix this will be in libc.
+ 		 */
+ 		err = f_getown(filp);
+ 		force_successful_syscall_return();
+ 		break;
+ 	case F_SETOWN:
+ 		err = f_setown(filp, arg, 1);
+ 		break;
+ 	case F_GETOWN_EX:
+ 		err = f_getown_ex(filp, arg);
+ 		break;
+ 	case F_SETOWN_EX:
+ 		err = f_setown_ex(filp, arg);
+ 		break;
+ 	case F_GETOWNER_UIDS:
+ 		err = f_getowner_uids(filp, arg);
+ 		break;
+ 	case F_GETSIG:
+ 		err = filp->f_owner.signum;
+ 		break;
+ 	case F_SETSIG:
+ 		/* arg == 0 restores default behaviour. */
+ 		if (!valid_signal(arg)) {
+ 			break;	/* err is still -EINVAL here */
+ 		}
+ 		err = 0;
+ 		filp->f_owner.signum = arg;
+ 		break;
+ 	case F_GETLEASE:
+ 		err = fcntl_getlease(filp);
+ 		break;
+ 	case F_SETLEASE:
+ 		err = fcntl_setlease(fd, filp, arg);
+ 		break;
+ 	case F_NOTIFY:
+ 		err = fcntl_dirnotify(fd, filp, arg);
+ 		break;
+ 	case F_SETPIPE_SZ:
+ 	case F_GETPIPE_SZ:
+ 		err = pipe_fcntl(filp, cmd, arg);
+ 		break;
+ 	case F_ADD_SEALS:
+ 	case F_GET_SEALS:
+ 		err = memfd_fcntl(filp, cmd, arg);
+ 		break;
+ 	case F_GET_RW_HINT:
+ 	case F_SET_RW_HINT:
+ 	case F_GET_FILE_RW_HINT:
+ 	case F_SET_FILE_RW_HINT:
+ 		err = fcntl_rw_hint(filp, cmd, arg);
+ 		break;
+ 	default:
+ 		break;
+ 	}
+ 	return err;
+ }
+ 
+ static int check_fcntl_cmd(unsigned cmd)
+ {	/* Returns 1 for the five commands listed below and 0 for every other cmd. */
+ 	switch (cmd) {
+ 	case F_DUPFD:
+ 	case F_DUPFD_CLOEXEC:
+ 	case F_GETFD:
+ 	case F_SETFD:
+ 	case F_GETFL:
+ 		return 1;
+ 	}
+ 	return 0;	/* the syscall entry points in this file refuse such cmds on FMODE_PATH files */
+ }
+ 
+ SYSCALL_DEFINE3(fcntl, unsigned int, fd, unsigned int, cmd, unsigned long, arg)
+ {	/* fcntl entry point: look up fd, filter cmd, run the security hook, then call do_fcntl(). */
+ 	struct fd f = fdget_raw(fd);
+ 	long err = -EBADF;	/* result if fd has no file or cmd is refused below */
+ 
+ 	if (!f.file)
+ 		goto out;	/* nothing was acquired, so skip fdput() */
+ 
+ 	if (unlikely(f.file->f_mode & FMODE_PATH)) {
+ 		if (!check_fcntl_cmd(cmd))
+ 			goto out1;	/* err is still -EBADF here */
+ 	}
+ 
+ 	err = security_file_fcntl(f.file, cmd, arg);
+ 	if (!err)	/* dispatch only when the security hook returned 0 */
+ 		err = do_fcntl(fd, cmd, arg, f.file);
+ 
+ out1:
+  	fdput(f);
+ out:
+ 	return err;
+ }
+ 
+ #if BITS_PER_LONG == 32
+ SYSCALL_DEFINE3(fcntl64, unsigned int, fd, unsigned int, cmd,
+ 		unsigned long, arg)
+ {	/* Handles the struct flock64 lock commands itself; every other cmd goes to do_fcntl(). */
+ 	void __user *argp = (void __user *)arg;
+ 	struct fd f = fdget_raw(fd);
+ 	struct flock64 flock;
+ 	long err = -EBADF;	/* result if fd has no file or cmd is refused below */
+ 
+ 	if (!f.file)
+ 		goto out;	/* nothing was acquired, so skip fdput() */
+ 
+ 	if (unlikely(f.file->f_mode & FMODE_PATH)) {
+ 		if (!check_fcntl_cmd(cmd))
+ 			goto out1;	/* err is still -EBADF here */
+ 	}
+ 
+ 	err = security_file_fcntl(f.file, cmd, arg);
+ 	if (err)
+ 		goto out1;
+ 	
+ 	switch (cmd) {
+ 	case F_GETLK64:
+ 	case F_OFD_GETLK:
+ 		err = -EFAULT;	/* preset so a failed copy_from_user() can simply break */
+ 		if (copy_from_user(&flock, argp, sizeof(flock)))
+ 			break;
+ 		err = fcntl_getlk64(f.file, cmd, &flock);
+ 		if (!err && copy_to_user(argp, &flock, sizeof(flock)))	/* write the result back only on success */
+ 			err = -EFAULT;
+ 		break;
+ 	case F_SETLK64:
+ 	case F_SETLKW64:
+ 	case F_OFD_SETLK:
+ 	case F_OFD_SETLKW:
+ 		err = -EFAULT;	/* preset so a failed copy_from_user() can simply break */
+ 		if (copy_from_user(&flock, argp, sizeof(flock)))
+ 			break;
+ 		err = fcntl_setlk64(fd, f.file, cmd, &flock);
+ 		break;
+ 	default:
+ 		err = do_fcntl(fd, cmd, arg, f.file);	/* not a flock64 command */
+ 		break;
+ 	}
+ out1:
+ 	fdput(f);
+ out:
+ 	return err;
+ }
+ #endif
+ 
+ #ifdef CONFIG_COMPAT
+ /* careful - expands to five bare statements (no do/while wrapper); used only by the four helpers below, then #undef'd */
+ #define copy_flock_fields(dst, src)		\
+ 	(dst)->l_type = (src)->l_type;		\
+ 	(dst)->l_whence = (src)->l_whence;	\
+ 	(dst)->l_start = (src)->l_start;	\
+ 	(dst)->l_len = (src)->l_len;		\
+ 	(dst)->l_pid = (src)->l_pid;
+ 
+ static int get_compat_flock(struct flock *kfl, const struct compat_flock __user *ufl)
+ {	/* Read a compat_flock from user space into *kfl; returns 0 or -EFAULT. */
+ 	struct compat_flock fl;
+ 
+ 	if (copy_from_user(&fl, ufl, sizeof(struct compat_flock)))
+ 		return -EFAULT;	/* *kfl is left unmodified on failure */
+ 	copy_flock_fields(kfl, &fl);
+ 	return 0;
+ }
+ 
+ static int get_compat_flock64(struct flock *kfl, const struct compat_flock64 __user *ufl)
+ {	/* Read a compat_flock64 from user space into *kfl; returns 0 or -EFAULT. */
+ 	struct compat_flock64 fl;
+ 
+ 	if (copy_from_user(&fl, ufl, sizeof(struct compat_flock64)))
+ 		return -EFAULT;	/* *kfl is left unmodified on failure */
+ 	copy_flock_fields(kfl, &fl);
+ 	return 0;
+ }
+ 
+ static int put_compat_flock(const struct flock *kfl, struct compat_flock __user *ufl)
+ {	/* Write *kfl to user space as a compat_flock; returns 0 or -EFAULT. */
+ 	struct compat_flock fl;
+ 
+ 	memset(&fl, 0, sizeof(struct compat_flock));	/* bytes not assigned below reach user space as zero */
+ 	copy_flock_fields(&fl, kfl);
+ 	if (copy_to_user(ufl, &fl, sizeof(struct compat_flock)))
+ 		return -EFAULT;
+ 	return 0;
+ }
+ 
+ static int put_compat_flock64(const struct flock *kfl, struct compat_flock64 __user *ufl)
+ {	/* Write *kfl to user space as a compat_flock64; returns 0 or -EFAULT. */
+ 	struct compat_flock64 fl;
+ 
+ 	BUILD_BUG_ON(sizeof(kfl->l_start) > sizeof(ufl->l_start));	/* build fails if the kernel field is wider */
+ 	BUILD_BUG_ON(sizeof(kfl->l_len) > sizeof(ufl->l_len));	/* same check for l_len */
+ 
+ 	memset(&fl, 0, sizeof(struct compat_flock64));	/* bytes not assigned below reach user space as zero */
+ 	copy_flock_fields(&fl, kfl);
+ 	if (copy_to_user(ufl, &fl, sizeof(struct compat_flock64)))
+ 		return -EFAULT;
+ 	return 0;
+ }
+ #undef copy_flock_fields
+ 
+ static unsigned int
+ convert_fcntl_cmd(unsigned int cmd)
+ {	/* Map the three *LK64 commands to their non-64 names; any other cmd is returned as is. */
+ 	switch (cmd) {
+ 	case F_GETLK64:
+ 		return F_GETLK;
+ 	case F_SETLK64:
+ 		return F_SETLK;
+ 	case F_SETLKW64:
+ 		return F_SETLKW;
+ 	}
+ 
+ 	return cmd;	/* includes the F_OFD_* commands, which have no case above */
+ }
+ 
+ /*
+  * GETLK was successful and we need to return the data, but it needs to fit in
+  * the compat structure.
+  * l_start shouldn't be too big, unless the original start + end is greater than
+  * COMPAT_OFF_T_MAX, in which case the app was asking for trouble, so we return
+  * -EOVERFLOW in that case.  l_len could be too big, in which case we just
+  * truncate it to COMPAT_OFF_T_MAX, and only allow the app to see that part of
+  * the conflicting lock that might make sense to it anyway.  Returns 0 otherwise.
+  */
+ static int fixup_compat_flock(struct flock *flock)
+ {
+ 	if (flock->l_start > COMPAT_OFF_T_MAX)
+ 		return -EOVERFLOW;	/* *flock is not modified in this case */
+ 	if (flock->l_len > COMPAT_OFF_T_MAX)
+ 		flock->l_len = COMPAT_OFF_T_MAX;	/* clamp in place */
+ 	return 0;
+ }
+ 
+ static long do_compat_fcntl64(unsigned int fd, unsigned int cmd,
+ 			     compat_ulong_t arg)
+ {	/* Common body of the compat fcntl/fcntl64 entry points below. */
+ 	struct fd f = fdget_raw(fd);
+ 	struct flock flock;	/* native copy filled from / written to the compat layout */
+ 	long err = -EBADF;	/* result if fd has no file or cmd is refused below */
+ 
+ 	if (!f.file)
+ 		return err;
+ 
+ 	if (unlikely(f.file->f_mode & FMODE_PATH)) {
+ 		if (!check_fcntl_cmd(cmd))
+ 			goto out_put;	/* err is still -EBADF here */
+ 	}
+ 
+ 	err = security_file_fcntl(f.file, cmd, arg);
+ 	if (err)
+ 		goto out_put;
+ 
+ 	switch (cmd) {
+ 	case F_GETLK:
+ 		err = get_compat_flock(&flock, compat_ptr(arg));
+ 		if (err)
+ 			break;
+ 		err = fcntl_getlk(f.file, convert_fcntl_cmd(cmd), &flock);	/* convert_fcntl_cmd() leaves F_GETLK unchanged */
+ 		if (err)
+ 			break;
+ 		err = fixup_compat_flock(&flock);	/* may clamp l_len or fail with -EOVERFLOW */
+ 		if (!err)
+ 			err = put_compat_flock(&flock, compat_ptr(arg));
+ 		break;
+ 	case F_GETLK64:
+ 	case F_OFD_GETLK:
+ 		err = get_compat_flock64(&flock, compat_ptr(arg));
+ 		if (err)
+ 			break;
+ 		err = fcntl_getlk(f.file, convert_fcntl_cmd(cmd), &flock);	/* F_GETLK64 becomes F_GETLK here */
+ 		if (!err)	/* no fixup_compat_flock() call on the 64-bit layout path */
+ 			err = put_compat_flock64(&flock, compat_ptr(arg));
+ 		break;
+ 	case F_SETLK:
+ 	case F_SETLKW:
+ 		err = get_compat_flock(&flock, compat_ptr(arg));
+ 		if (err)
+ 			break;
+ 		err = fcntl_setlk(fd, f.file, convert_fcntl_cmd(cmd), &flock);
+ 		break;
+ 	case F_SETLK64:
+ 	case F_SETLKW64:
+ 	case F_OFD_SETLK:
+ 	case F_OFD_SETLKW:
+ 		err = get_compat_flock64(&flock, compat_ptr(arg));
+ 		if (err)
+ 			break;
+ 		err = fcntl_setlk(fd, f.file, convert_fcntl_cmd(cmd), &flock);
+ 		break;
+ 	default:
+ 		err = do_fcntl(fd, cmd, arg, f.file);	/* not a lock command: arg is passed through unconverted */
+ 		break;
+ 	}
+ out_put:
+ 	fdput(f);
+ 	return err;
+ }
+ 
+ COMPAT_SYSCALL_DEFINE3(fcntl64, unsigned int, fd, unsigned int, cmd,
+ 		       compat_ulong_t, arg)
+ {	/* Compat fcntl64: accepts every cmd and forwards it unchanged. */
+ 	return do_compat_fcntl64(fd, cmd, arg);
+ }
+ 
+ COMPAT_SYSCALL_DEFINE3(fcntl, unsigned int, fd, unsigned int, cmd,
+ 		       compat_ulong_t, arg)
+ {	/* Compat fcntl: like compat fcntl64 except that the six commands below are refused. */
+ 	switch (cmd) {
+ 	case F_GETLK64:
+ 	case F_SETLK64:
+ 	case F_SETLKW64:
+ 	case F_OFD_GETLK:
+ 	case F_OFD_SETLK:
+ 	case F_OFD_SETLKW:
+ 		return -EINVAL;	/* rejected before fd is even looked up */
+ 	}
+ 	return do_compat_fcntl64(fd, cmd, arg);
+ }
+ #endif
+ 
+ /* Table to convert sigio signal codes into poll band bitmaps; send_sigio_to_task() indexes it with (reason - POLL_IN) */
+ 
+ static const __poll_t band_table[NSIGPOLL] = {
+ 	EPOLLIN | EPOLLRDNORM,			/* POLL_IN */
+ 	EPOLLOUT | EPOLLWRNORM | EPOLLWRBAND,	/* POLL_OUT */
+ 	EPOLLIN | EPOLLRDNORM | EPOLLMSG,		/* POLL_MSG */
+ 	EPOLLERR,				/* POLL_ERR */
+ 	EPOLLPRI | EPOLLRDBAND,			/* POLL_PRI */
+ 	EPOLLHUP | EPOLLERR			/* POLL_HUP */
+ };
+ 
+ static inline int sigio_perm(struct task_struct *p,
+                              struct fown_struct *fown, int sig)
+ {	/* Non-zero if the owner described by fown may send sig to task p, 0 otherwise. */
+ 	const struct cred *cred;
+ 	int ret;
+ 
+ 	rcu_read_lock();	/* held across the __task_cred() result's use */
+ 	cred = __task_cred(p);
+ 	ret = ((uid_eq(fown->euid, GLOBAL_ROOT_UID) ||	/* owner euid is root, or ... */
+ 		uid_eq(fown->euid, cred->suid) || uid_eq(fown->euid, cred->uid) ||	/* ... owner euid matches p's suid/uid, or ... */
+ 		uid_eq(fown->uid,  cred->suid) || uid_eq(fown->uid,  cred->uid)) &&	/* ... owner uid matches p's suid/uid */
+ 	       !security_file_send_sigiotask(p, fown, sig));	/* and the security hook returns 0 */
+ 	rcu_read_unlock();
+ 	return ret;
+ }
+ 
+ static void send_sigio_to_task(struct task_struct *p,
+ 			       struct fown_struct *fown,
+ 			       int fd, int reason, enum pid_type type)
+ {	/* Deliver the owner's chosen signal (or plain SIGIO) for fd to task p, if sigio_perm() allows it. */
+ 	/*
+ 	 * F_SETSIG can change ->signum lockless in parallel, make
+ 	 * sure we read it once and use the same value throughout.
+ 	 */
+ 	int signum = READ_ONCE(fown->signum);
+ 
+ 	if (!sigio_perm(p, fown, signum))
+ 		return;
+ 
+ 	switch (signum) {
+ 		kernel_siginfo_t si;	/* declaration only; it is filled in on the default: path */
+ 		default:
+ 			/* Queue a rt signal with the appropriate fd as its
+ 			   value.  We use SI_SIGIO as the source, not
+ 			   SI_KERNEL, since kernel signals always get
+ 			   delivered even if we can't queue.  Failure to
+ 			   queue in this case _should_ be reported; we fall
+ 			   back to SIGIO in that case. --sct */
+ 			clear_siginfo(&si);
+ 			si.si_signo = signum;
+ 			si.si_errno = 0;
+ 		        si.si_code  = reason;
+ 			/*
+ 			 * POSIX defines POLL_IN and friends to be signal
+ 			 * specific si_codes for SIGPOLL.  Linux extended
+ 			 * these si_codes to other signals in a way that is
+ 			 * ambiguous if other signals also have signal
+ 			 * specific si_codes.  In that case use SI_SIGIO instead
+ 			 * to remove the ambiguity.
+ 			 */
+ 			if ((signum != SIGPOLL) && sig_specific_sicodes(signum))
+ 				si.si_code = SI_SIGIO;
+ 
+ 			/* Make sure we are called with one of the POLL_*
+ 			   reasons, otherwise we could leak kernel stack into
+ 			   userspace.  */
+ 			BUG_ON((reason < POLL_IN) || ((reason - POLL_IN) >= NSIGPOLL));
+ 			if (reason - POLL_IN >= NSIGPOLL)	/* same upper-bound test as the BUG_ON() above */
+ 				si.si_band  = ~0L;
+ 			else
+ 				si.si_band = mangle_poll(band_table[reason - POLL_IN]);
+ 			si.si_fd    = fd;
+ 			if (!do_send_sig_info(signum, &si, p, type))
+ 				break;	/* sent successfully: no fallback needed */
+ 		/* fall-through - fall back on the old plain SIGIO signal */
+ 		case 0:	/* also the direct path when signum is 0 */
+ 			do_send_sig_info(SIGIO, SEND_SIG_PRIV, p, type);
+ 	}
+ }
+ 
+ void send_sigio(struct fown_struct *fown, int fd, int band)
+ {	/* Send the I/O signal for fd to the task(s) identified by fown->pid; does nothing if no pid is set. */
+ 	struct task_struct *p;
+ 	enum pid_type type;
+ 	unsigned long flags;
+ 	struct pid *pid;
+ 	
+ 	read_lock_irqsave(&fown->lock, flags);	/* fown->pid_type and fown->pid are read under this lock */
+ 
+ 	type = fown->pid_type;
+ 	pid = fown->pid;
+ 	if (!pid)
+ 		goto out_unlock_fown;
+ 
+ 	if (type <= PIDTYPE_TGID) {	/* single target: look it up as a PIDTYPE_PID task under RCU */
+ 		rcu_read_lock();
+ 		p = pid_task(pid, PIDTYPE_PID);
+ 		if (p)
+ 			send_sigio_to_task(p, fown, fd, band, type);
+ 		rcu_read_unlock();
+ 	} else {	/* otherwise visit every task attached to pid for this type */
+ 		read_lock(&tasklist_lock);
+ 		do_each_pid_task(pid, type, p) {
+ 			send_sigio_to_task(p, fown, fd, band, type);
+ 		} while_each_pid_task(pid, type, p);
+ 		read_unlock(&tasklist_lock);
+ 	}
+  out_unlock_fown:
+ 	read_unlock_irqrestore(&fown->lock, flags);
+ }
+ 
+ static void send_sigurg_to_task(struct task_struct *p,
+ 				struct fown_struct *fown, enum pid_type type)
+ {	/* Send SIGURG to p when sigio_perm() allows it; otherwise do nothing. */
+ 	if (sigio_perm(p, fown, SIGURG))
+ 		do_send_sig_info(SIGURG, SEND_SIG_PRIV, p, type);	/* return value is not checked */
+ }
+ 
+ int send_sigurg(struct fown_struct *fown)
+ {	/* Send SIGURG to the task(s) identified by fown->pid. Returns 0 if no pid is set, 1 otherwise. */
+ 	struct task_struct *p;
+ 	enum pid_type type;
+ 	struct pid *pid;
+ 	unsigned long flags;
+ 	int ret = 0;
+ 	
+ 	read_lock_irqsave(&fown->lock, flags);	/* fown->pid_type and fown->pid are read under this lock */
+ 
+ 	type = fown->pid_type;
+ 	pid = fown->pid;
+ 	if (!pid)
+ 		goto out_unlock_fown;
+ 
+ 	ret = 1;	/* an owner pid exists; set regardless of whether any signal is delivered below */
+ 
+ 	if (type <= PIDTYPE_TGID) {	/* single target: look it up as a PIDTYPE_PID task under RCU */
+ 		rcu_read_lock();
+ 		p = pid_task(pid, PIDTYPE_PID);
+ 		if (p)
+ 			send_sigurg_to_task(p, fown, type);
+ 		rcu_read_unlock();
+ 	} else {	/* otherwise visit every task attached to pid for this type */
+ 		read_lock(&tasklist_lock);
+ 		do_each_pid_task(pid, type, p) {
+ 			send_sigurg_to_task(p, fown, type);
+ 		} while_each_pid_task(pid, type, p);
+ 		read_unlock(&tasklist_lock);
+ 	}
+  out_unlock_fown:
+ 	read_unlock_irqrestore(&fown->lock, flags);
+ 	return ret;
+ }
+ 
+ static DEFINE_SPINLOCK(fasync_lock);	/* taken around fasync list walks/updates in the insert and remove paths */
+ static struct kmem_cache *fasync_cache __read_mostly;	/* created in fcntl_init(); backs every fasync_struct */
+ 
+ static void fasync_free_rcu(struct rcu_head *head)
+ {	/* call_rcu() callback: free the fasync_struct that embeds head as fa_rcu. */
+ 	kmem_cache_free(fasync_cache,
+ 			container_of(head, struct fasync_struct, fa_rcu));
+ }
+ 
+ /*
+  * Remove filp's entry from the fasync list at *fapp. If one was found,
+  * unlink it, clear FASYNC in filp->f_flags and return 1; otherwise do
+  * nothing and return 0.  The entry itself is freed via call_rcu().
+  *
+  * NOTE! It is very important that the FASYNC flag always
+  * match the state "is the filp on a fasync list".
+  * Both updates happen under filp->f_lock and fasync_lock.
+  */
+ int fasync_remove_entry(struct file *filp, struct fasync_struct **fapp)
+ {
+ 	struct fasync_struct *fa, **fp;
+ 	int result = 0;
+ 
+ 	spin_lock(&filp->f_lock);
+ 	spin_lock(&fasync_lock);
+ 	for (fp = fapp; (fa = *fp) != NULL; fp = &fa->fa_next) {	/* fp always points at the link that leads to fa */
+ 		if (fa->fa_file != filp)
+ 			continue;
+ 
+ 		write_lock_irq(&fa->fa_lock);
+ 		fa->fa_file = NULL;	/* kill_fasync_rcu() skips entries whose fa_file is NULL */
+ 		write_unlock_irq(&fa->fa_lock);
+ 
+ 		*fp = fa->fa_next;	/* unlink fa from the list */
+ 		call_rcu(&fa->fa_rcu, fasync_free_rcu);
+ 		filp->f_flags &= ~FASYNC;
+ 		result = 1;
+ 		break;	/* only the first matching entry is removed */
+ 	}
+ 	spin_unlock(&fasync_lock);
+ 	spin_unlock(&filp->f_lock);
+ 	return result;
+ }
+ 
+ struct fasync_struct *fasync_alloc(void)
+ {	/* Allocate one entry from fasync_cache with GFP_KERNEL; the result is returned unchecked. */
+ 	return kmem_cache_alloc(fasync_cache, GFP_KERNEL);
+ }
+ 
+ /*
+  * NOTE! This can be used only for unused fasync entries:
+  * entries that actually got inserted on the fasync list
+  * need to be released by rcu - see fasync_remove_entry.
+  */
+ void fasync_free(struct fasync_struct *new)
+ {	/* Immediate free back to fasync_cache, with no RCU grace period. */
+ 	kmem_cache_free(fasync_cache, new);
+ }
+ 
+ /*
+  * Insert 'new' at the head of the fasync list unless filp already has an
+  * entry.  Returns the existing entry (only its fa_fd is updated and 'new' is
+  * left untouched for the caller to free), or NULL when 'new' was inserted.
+  * NOTE! It is very important that the FASYNC flag always
+  * match the state "is the filp on a fasync list".
+  */
+ struct fasync_struct *fasync_insert_entry(int fd, struct file *filp, struct fasync_struct **fapp, struct fasync_struct *new)
+ {
+         struct fasync_struct *fa, **fp;
+ 
+ 	spin_lock(&filp->f_lock);
+ 	spin_lock(&fasync_lock);
+ 	for (fp = fapp; (fa = *fp) != NULL; fp = &fa->fa_next) {
+ 		if (fa->fa_file != filp)
+ 			continue;
+ 
+ 		write_lock_irq(&fa->fa_lock);
+ 		fa->fa_fd = fd;	/* existing entry: just record the new fd */
+ 		write_unlock_irq(&fa->fa_lock);
+ 		goto out;	/* fa is non-NULL here and becomes the return value */
+ 	}
+ 
+ 	rwlock_init(&new->fa_lock);	/* reached only when the loop ended with fa == NULL */
+ 	new->magic = FASYNC_MAGIC;
+ 	new->fa_file = filp;
+ 	new->fa_fd = fd;
+ 	new->fa_next = *fapp;
+ 	rcu_assign_pointer(*fapp, new);	/* publish only after all fields of 'new' are set */
+ 	filp->f_flags |= FASYNC;
+ 
+ out:
+ 	spin_unlock(&fasync_lock);
+ 	spin_unlock(&filp->f_lock);
+ 	return fa;
+ }
+ 
+ /*
+  * Add a fasync entry. Returns -ENOMEM if allocation fails, 1 if a new
+  * entry was added, and 0 if an existing entry for filp was only updated.
+  */
+ static int fasync_add_entry(int fd, struct file *filp, struct fasync_struct **fapp)
+ {
+ 	struct fasync_struct *new;
+ 
+ 	new = fasync_alloc();	/* allocated before fasync_insert_entry() takes its spinlocks */
+ 	if (!new)
+ 		return -ENOMEM;
+ 
+ 	/*
+ 	 * fasync_insert_entry() returns the old (update) entry if
+ 	 * it existed.
+ 	 *
+ 	 * So free the (unused) new entry and return 0 to let the
+ 	 * caller know that we didn't add any new fasync entries.
+ 	 */
+ 	if (fasync_insert_entry(fd, filp, fapp, new)) {
+ 		fasync_free(new);
+ 		return 0;
+ 	}
+ 
+ 	return 1;	/* 'new' is now on the list */
+ }
+ 
+ /*
+  * fasync_helper() is used by almost all character device drivers
+  * to set up the fasync queue, and for regular files by the file
+  * lease code. on == 0 removes filp's entry, otherwise one is added/updated.
+  * Returns negative on error, 0 if unchanged, positive if added/deleted.
+  */
+ int fasync_helper(int fd, struct file * filp, int on, struct fasync_struct **fapp)
+ {
+ 	if (!on)
+ 		return fasync_remove_entry(filp, fapp);	/* fd is not used on the removal path */
+ 	return fasync_add_entry(fd, filp, fapp);
+ }
+ 
+ EXPORT_SYMBOL(fasync_helper);
+ 
+ /*
+  * Walk the list from fa and send_sigio() to each entry's file owner; rcu_read_lock() is held
+  */
+ static void kill_fasync_rcu(struct fasync_struct *fa, int sig, int band)
+ {
+ 	while (fa) {
+ 		struct fown_struct *fown;
+ 
+ 		if (fa->magic != FASYNC_MAGIC) {
+ 			printk(KERN_ERR "kill_fasync: bad magic number in "
+ 			       "fasync_struct!\n");
+ 			return;	/* the rest of the list is not visited */
+ 		}
+ 		read_lock(&fa->fa_lock);
+ 		if (fa->fa_file) {	/* fasync_remove_entry() sets fa_file to NULL on removal */
+ 			fown = &fa->fa_file->f_owner;
+ 			/* Don't send SIGURG to processes which have not set a
+ 			   queued signum: SIGURG has its own default signalling
+ 			   mechanism. */
+ 			if (!(sig == SIGURG && fown->signum == 0))
+ 				send_sigio(fown, fa->fa_fd, band);
+ 		}
+ 		read_unlock(&fa->fa_lock);
+ 		fa = rcu_dereference(fa->fa_next);
+ 	}
+ }
+ 
+ void kill_fasync(struct fasync_struct **fp, int sig, int band)
+ {	/* Signal every owner on the fasync list at *fp; a no-op when the list is empty. */
+ 	/* First a quick test without locking: usually
+ 	 * the list is empty.
+ 	 */
+ 	if (*fp) {
+ 		rcu_read_lock();
+ 		kill_fasync_rcu(rcu_dereference(*fp), sig, band);	/* *fp is re-read under RCU */
+ 		rcu_read_unlock();
+ 	}
+ }
+ EXPORT_SYMBOL(kill_fasync);
+ 
+ static int __init fcntl_init(void)
+ {	/* Init hook: compile-time check of the open-flag bit count, then create fasync_cache. */
+ 	/*
+ 	 * Please add new bits here to ensure allocation uniqueness.
+ 	 * Exceptions: O_NONBLOCK is a two bit define on parisc; O_NDELAY
+ 	 * is defined as O_NONBLOCK on some platforms and not on others.
+ 	 */
+ 	BUILD_BUG_ON(21 - 1 /* for O_RDONLY being 0 */ !=
+ 		HWEIGHT32(
+ 			(VALID_OPEN_FLAGS & ~(O_NONBLOCK | O_NDELAY)) |
+ 			__FMODE_EXEC | __FMODE_NONOTIFY));
+ 
+ 	fasync_cache = kmem_cache_create("fasync_cache",
+ 		sizeof(struct fasync_struct), 0, SLAB_PANIC, NULL);	/* result is not NULL-checked; presumably SLAB_PANIC covers failure - confirm */
+ 	return 0;
+ }
+ 
+ module_init(fcntl_init)
diff --color -rcNP Master/fs/fcntl.c.rej OG/fs/fcntl.c.rej
*** Master/fs/fcntl.c.rej	1969-12-31 19:00:00.000000000 -0500
--- OG/fs/fcntl.c.rej	2021-04-20 15:11:27.315000000 -0400
***************
*** 0 ****
--- 1,106 ----
+ *** fs/fcntl.c	2021-03-13 14:12:49.000000000 +0200
+ --- fs/fcntl.c	2021-03-11 15:06:51.000000000 +0200
+ ***************
+ *** 446,452 ****
+   }
+   
+   SYSCALL_DEFINE3(fcntl, unsigned int, fd, unsigned int, cmd, unsigned long, arg)
+ ! {
+   	struct fd f = fdget_raw(fd);
+   	long err = -EBADF;
+   
+ --- 443,449 ----
+   }
+   
+   SYSCALL_DEFINE3(fcntl, unsigned int, fd, unsigned int, cmd, unsigned long, arg)
+ ! {
+   	struct fd f = fdget_raw(fd);
+   	long err = -EBADF;
+   
+ ***************
+ *** 471,477 ****
+   #if BITS_PER_LONG == 32
+   SYSCALL_DEFINE3(fcntl64, unsigned int, fd, unsigned int, cmd,
+   		unsigned long, arg)
+ ! {
+   	void __user *argp = (void __user *)arg;
+   	struct fd f = fdget_raw(fd);
+   	struct flock64 flock;
+ --- 468,474 ----
+   #if BITS_PER_LONG == 32
+   SYSCALL_DEFINE3(fcntl64, unsigned int, fd, unsigned int, cmd,
+   		unsigned long, arg)
+ ! {
+   	void __user *argp = (void __user *)arg;
+   	struct fd f = fdget_raw(fd);
+   	struct flock64 flock;
+ ***************
+ *** 488,494 ****
+   	err = security_file_fcntl(f.file, cmd, arg);
+   	if (err)
+   		goto out1;
+ ! 
+   	switch (cmd) {
+   	case F_GETLK64:
+   	case F_OFD_GETLK:
+ --- 485,491 ----
+   	err = security_file_fcntl(f.file, cmd, arg);
+   	if (err)
+   		goto out1;
+ ! 
+   	switch (cmd) {
+   	case F_GETLK64:
+   	case F_OFD_GETLK:
+ ***************
+ *** 738,745 ****
+   		kernel_siginfo_t si;
+   		default:
+   			/* Queue a rt signal with the appropriate fd as its
+ ! 			   value.  We use SI_SIGIO as the source, not
+ ! 			   SI_KERNEL, since kernel signals always get
+   			   delivered even if we can't queue.  Failure to
+   			   queue in this case _should_ be reported; we fall
+   			   back to SIGIO in that case. --sct */
+ --- 735,742 ----
+   		kernel_siginfo_t si;
+   		default:
+   			/* Queue a rt signal with the appropriate fd as its
+ ! 			   value.  We use SI_SIGIO as the source, not
+ ! 			   SI_KERNEL, since kernel signals always get
+   			   delivered even if we can't queue.  Failure to
+   			   queue in this case _should_ be reported; we fall
+   			   back to SIGIO in that case. --sct */
+ ***************
+ *** 781,787 ****
+   	enum pid_type type;
+   	unsigned long flags;
+   	struct pid *pid;
+ ! 
+   	read_lock_irqsave(&fown->lock, flags);
+   
+   	type = fown->pid_type;
+ --- 778,784 ----
+   	enum pid_type type;
+   	unsigned long flags;
+   	struct pid *pid;
+ ! 
+   	read_lock_irqsave(&fown->lock, flags);
+   
+   	type = fown->pid_type;
+ ***************
+ *** 820,826 ****
+   	struct pid *pid;
+   	unsigned long flags;
+   	int ret = 0;
+ ! 
+   	read_lock_irqsave(&fown->lock, flags);
+   
+   	type = fown->pid_type;
+ --- 817,823 ----
+   	struct pid *pid;
+   	unsigned long flags;
+   	int ret = 0;
+ ! 
+   	read_lock_irqsave(&fown->lock, flags);
+   
+   	type = fown->pid_type;
diff --color -rcNP Master/fs/namei.c OG/fs/namei.c
*** Master/fs/namei.c	2021-04-20 14:17:31.000000000 -0400
--- OG/fs/namei.c	2021-04-20 15:11:34.510000000 -0400
***************
*** 39,44 ****
--- 39,45 ----
  #include <linux/bitops.h>
  #include <linux/init_task.h>
  #include <linux/uaccess.h>
+ #include <linux/minisec.h>
  
  #include "internal.h"
  #include "mount.h"
***************
*** 52,59 ****
   * The new code replaces the old recursive symlink resolution with
   * an iterative one (in case of non-nested symlink chains).  It does
   * this with calls to <fs>_follow_link().
!  * As a side effect, dir_namei(), _namei() and follow_link() are now 
!  * replaced with a single function lookup_dentry() that can handle all 
   * the special cases of the former code.
   *
   * With the new dcache, the pathname is stored at each inode, at least as
--- 53,60 ----
   * The new code replaces the old recursive symlink resolution with
   * an iterative one (in case of non-nested symlink chains).  It does
   * this with calls to <fs>_follow_link().
!  * As a side effect, dir_namei(), _namei() and follow_link() are now
!  * replaced with a single function lookup_dentry() that can handle all
   * the special cases of the former code.
   *
   * With the new dcache, the pathname is stored at each inode, at least as
***************
*** 3751,3756 ****
--- 3752,3763 ----
  
  	if (!IS_POSIXACL(path.dentry->d_inode))
  		mode &= ~current_umask();
+ 
+ 	if (gr_handle_chroot_mknod(dentry, path.mnt, mode)) {
+ 		error = -EPERM;
+ 		goto out;
+ 	}
+ 
  	error = security_path_mknod(&path, dentry, mode, dev);
  	if (error)
  		goto out;
diff --color -rcNP Master/fs/namei.c.orig OG/fs/namei.c.orig
*** Master/fs/namei.c.orig	1969-12-31 19:00:00.000000000 -0500
--- OG/fs/namei.c.orig	2021-04-20 15:10:45.381000000 -0400
***************
*** 0 ****
--- 1,4865 ----
+ // SPDX-License-Identifier: GPL-2.0
+ /*
+  *  linux/fs/namei.c
+  *
+  *  Copyright (C) 1991, 1992  Linus Torvalds
+  */
+ 
+ /*
+  * Some corrections by tytso.
+  */
+ 
+ /* [Feb 1997 T. Schoebel-Theuer] Complete rewrite of the pathname
+  * lookup logic.
+  */
+ /* [Feb-Apr 2000, AV] Rewrite to the new namespace architecture.
+  */
+ 
+ #include <linux/init.h>
+ #include <linux/export.h>
+ #include <linux/kernel.h>
+ #include <linux/slab.h>
+ #include <linux/fs.h>
+ #include <linux/namei.h>
+ #include <linux/pagemap.h>
+ #include <linux/fsnotify.h>
+ #include <linux/personality.h>
+ #include <linux/security.h>
+ #include <linux/ima.h>
+ #include <linux/syscalls.h>
+ #include <linux/mount.h>
+ #include <linux/audit.h>
+ #include <linux/capability.h>
+ #include <linux/file.h>
+ #include <linux/fcntl.h>
+ #include <linux/device_cgroup.h>
+ #include <linux/fs_struct.h>
+ #include <linux/posix_acl.h>
+ #include <linux/hash.h>
+ #include <linux/bitops.h>
+ #include <linux/init_task.h>
+ #include <linux/uaccess.h>
+ #include <linux/minisec.h>
+ 
+ #include "internal.h"
+ #include "mount.h"
+ 
+ /* [Feb-1997 T. Schoebel-Theuer]
+  * Fundamental changes in the pathname lookup mechanisms (namei)
+  * were necessary because of omirr.  The reason is that omirr needs
+  * to know the _real_ pathname, not the user-supplied one, in case
+  * of symlinks (and also when transname replacements occur).
+  *
+  * The new code replaces the old recursive symlink resolution with
+  * an iterative one (in case of non-nested symlink chains).  It does
+  * this with calls to <fs>_follow_link().
+  * As a side effect, dir_namei(), _namei() and follow_link() are now 
+  * replaced with a single function lookup_dentry() that can handle all 
+  * the special cases of the former code.
+  *
+  * With the new dcache, the pathname is stored at each inode, at least as
+  * long as the refcount of the inode is positive.  As a side effect, the
+  * size of the dcache depends on the inode cache and thus is dynamic.
+  *
+  * [29-Apr-1998 C. Scott Ananian] Updated above description of symlink
+  * resolution to correspond with current state of the code.
+  *
+  * Note that the symlink resolution is not *completely* iterative.
+  * There is still a significant amount of tail- and mid- recursion in
+  * the algorithm.  Also, note that <fs>_readlink() is not used in
+  * lookup_dentry(): lookup_dentry() on the result of <fs>_readlink()
+  * may return different results than <fs>_follow_link().  Many virtual
+  * filesystems (including /proc) exhibit this behavior.
+  */
+ 
+ /* [24-Feb-97 T. Schoebel-Theuer] Side effects caused by new implementation:
+  * New symlink semantics: when open() is called with flags O_CREAT | O_EXCL
+  * and the name already exists in form of a symlink, try to create the new
+  * name indicated by the symlink. The old code always complained that the
+  * name already exists, due to not following the symlink even if its target
+  * is nonexistent.  The new semantics affects also mknod() and link() when
+  * the name is a symlink pointing to a non-existent name.
+  *
+  * I don't know which semantics is the right one, since I have no access
+  * to standards. But I found by trial that HP-UX 9.0 has the full "new"
+  * semantics implemented, while SunOS 4.1.1 and Solaris (SunOS 5.4) have the
+  * "old" one. Personally, I think the new semantics is much more logical.
+  * Note that "ln old new" where "new" is a symlink pointing to a non-existing
+  * file does succeed in both HP-UX and SunOS, but not in Solaris
+  * and in the old Linux semantics.
+  */
+ 
+ /* [16-Dec-97 Kevin Buhr] For security reasons, we change some symlink
+  * semantics.  See the comments in "open_namei" and "do_link" below.
+  *
+  * [10-Sep-98 Alan Modra] Another symlink change.
+  */
+ 
+ /* [Feb-Apr 2000 AV] Complete rewrite. Rules for symlinks:
+  *	inside the path - always follow.
+  *	in the last component in creation/removal/renaming - never follow.
+  *	if LOOKUP_FOLLOW passed - follow.
+  *	if the pathname has trailing slashes - follow.
+  *	otherwise - don't follow.
+  * (applied in that order).
+  *
+  * [Jun 2000 AV] Inconsistent behaviour of open() in case if flags==O_CREAT
+  * restored for 2.4. This is the last surviving part of old 4.2BSD bug.
+  * During the 2.4 we need to fix the userland stuff depending on it -
+  * hopefully we will be able to get rid of that wart in 2.5. So far only
+  * XEmacs seems to be relying on it...
+  */
+ /*
+  * [Sep 2001 AV] Single-semaphore locking scheme (kudos to David Holland)
+  * implemented.  Let's see if raised priority of ->s_vfs_rename_mutex gives
+  * any extra contention...
+  */
+ 
+ /* In order to reduce some races, while at the same time doing additional
+  * checking and hopefully speeding things up, we copy filenames to the
+  * kernel data space before using them..
+  *
+  * POSIX.1 2.4: an empty pathname is invalid (ENOENT).
+  * PATH_MAX includes the nul terminator --RR.
+  */
+ 
+ #define EMBEDDED_NAME_MAX	(PATH_MAX - offsetof(struct filename, iname))	/* bytes left for the name after the struct header */
+ 
+ struct filename *
+ getname_flags(const char __user *filename, int flags, int *empty)
+ {	/* Copy a user-space pathname into a new struct filename; returns it or an ERR_PTR(). */
+ 	struct filename *result;
+ 	char *kname;
+ 	int len;
+ 
+ 	result = audit_reusename(filename);
+ 	if (result)
+ 		return result;	/* audit code supplied an existing entry for this pointer */
+ 
+ 	result = __getname();
+ 	if (unlikely(!result))
+ 		return ERR_PTR(-ENOMEM);
+ 
+ 	/*
+ 	 * First, try to embed the struct filename inside the names_cache
+ 	 * allocation
+ 	 */
+ 	kname = (char *)result->iname;
+ 	result->name = kname;
+ 
+ 	len = strncpy_from_user(kname, filename, EMBEDDED_NAME_MAX);
+ 	if (unlikely(len < 0)) {
+ 		__putname(result);
+ 		return ERR_PTR(len);
+ 	}
+ 
+ 	/*
+ 	 * Uh-oh. We have a name that's approaching PATH_MAX. Allocate a
+ 	 * separate struct filename so we can dedicate the entire
+ 	 * names_cache allocation for the pathname, and re-do the copy from
+ 	 * userland.
+ 	 */
+ 	if (unlikely(len == EMBEDDED_NAME_MAX)) {
+ 		const size_t size = offsetof(struct filename, iname[1]);
+ 		kname = (char *)result;	/* the whole names_cache buffer now holds only the name */
+ 
+ 		/*
+ 		 * size is chosen this way to guarantee that
+ 		 * result->iname[0] is within the same object and that
+ 		 * kname can't be equal to result->iname, no matter what.
+ 		 */
+ 		result = kzalloc(size, GFP_KERNEL);
+ 		if (unlikely(!result)) {
+ 			__putname(kname);
+ 			return ERR_PTR(-ENOMEM);
+ 		}
+ 		result->name = kname;
+ 		len = strncpy_from_user(kname, filename, PATH_MAX);
+ 		if (unlikely(len < 0)) {
+ 			__putname(kname);
+ 			kfree(result);
+ 			return ERR_PTR(len);
+ 		}
+ 		if (unlikely(len == PATH_MAX)) {	/* buffer filled completely: name is too long */
+ 			__putname(kname);
+ 			kfree(result);
+ 			return ERR_PTR(-ENAMETOOLONG);
+ 		}
+ 	}
+ 
+ 	result->refcnt = 1;	/* set before the putname() call below, which decrements it */
+ 	/* The empty path is special. */
+ 	if (unlikely(!len)) {
+ 		if (empty)
+ 			*empty = 1;	/* written only for an empty name; never cleared here */
+ 		if (!(flags & LOOKUP_EMPTY)) {
+ 			putname(result);
+ 			return ERR_PTR(-ENOENT);
+ 		}
+ 	}
+ 
+ 	result->uptr = filename;
+ 	result->aname = NULL;
+ 	audit_getname(result);
+ 	return result;
+ }
+ 
+ struct filename *
+ getname(const char __user * filename)
+ {	/* getname_flags() with no flags and no 'empty' out-parameter. */
+ 	return getname_flags(filename, 0, NULL);
+ }
+ 
+ struct filename *
+ getname_kernel(const char * filename)
+ {	/* Build a struct filename from a kernel-space string; returns it or an ERR_PTR(). */
+ 	struct filename *result;
+ 	int len = strlen(filename) + 1;	/* length including the terminating NUL */
+ 
+ 	result = __getname();
+ 	if (unlikely(!result))
+ 		return ERR_PTR(-ENOMEM);
+ 
+ 	if (len <= EMBEDDED_NAME_MAX) {
+ 		result->name = (char *)result->iname;	/* name fits behind the struct header */
+ 	} else if (len <= PATH_MAX) {
+ 		const size_t size = offsetof(struct filename, iname[1]);
+ 		struct filename *tmp;
+ 
+ 		tmp = kmalloc(size, GFP_KERNEL);
+ 		if (unlikely(!tmp)) {
+ 			__putname(result);
+ 			return ERR_PTR(-ENOMEM);
+ 		}
+ 		tmp->name = (char *)result;	/* the __getname() buffer holds only the name */
+ 		result = tmp;
+ 	} else {
+ 		__putname(result);
+ 		return ERR_PTR(-ENAMETOOLONG);
+ 	}
+ 	memcpy((char *)result->name, filename, len);
+ 	result->uptr = NULL;	/* getname_flags() stores the user pointer here instead */
+ 	result->aname = NULL;
+ 	result->refcnt = 1;
+ 	audit_getname(result);
+ 
+ 	return result;
+ }
+ 
+ void putname(struct filename *name)
+ {	/* Drop one reference on name and free it when the count reaches zero. */
+ 	BUG_ON(name->refcnt <= 0);
+ 
+ 	if (--name->refcnt > 0)
+ 		return;	/* other references remain */
+ 
+ 	if (name->name != name->iname) {	/* name buffer and struct are separate allocations */
+ 		__putname(name->name);
+ 		kfree(name);
+ 	} else	/* name is embedded, so there is a single allocation to release */
+ 		__putname(name);
+ }
+ 
+ static int check_acl(struct inode *inode, int mask)
+ {	/* Check mask against the inode's ACL_TYPE_ACCESS ACL; -EAGAIN means no ACL was consulted. */
+ #ifdef CONFIG_FS_POSIX_ACL
+ 	struct posix_acl *acl;
+ 
+ 	if (mask & MAY_NOT_BLOCK) {
+ 		acl = get_cached_acl_rcu(inode, ACL_TYPE_ACCESS);
+ 	        if (!acl)
+ 	                return -EAGAIN;
+ 		/* no ->get_acl() calls in RCU mode... */
+ 		if (is_uncached_acl(acl))
+ 			return -ECHILD;
+ 	        return posix_acl_permission(inode, acl, mask & ~MAY_NOT_BLOCK);
+ 	}
+ 
+ 	acl = get_acl(inode, ACL_TYPE_ACCESS);
+ 	if (IS_ERR(acl))
+ 		return PTR_ERR(acl);
+ 	if (acl) {
+ 	        int error = posix_acl_permission(inode, acl, mask);
+ 	        posix_acl_release(acl);	/* release what get_acl() returned */
+ 	        return error;
+ 	}
+ #endif
+ 
+ 	return -EAGAIN;	/* also the only result when CONFIG_FS_POSIX_ACL is not set */
+ }
+ 
+ /*
+  * This does the basic permission checking; returns 0, -EACCES or a check_acl() result
+  */
+ static int acl_permission_check(struct inode *inode, int mask)
+ {
+ 	unsigned int mode = inode->i_mode;
+ 
+ 	if (likely(uid_eq(current_fsuid(), inode->i_uid)))
+ 		mode >>= 6;	/* owner: bring the owner bits down to the low three bits */
+ 	else {
+ 		if (IS_POSIXACL(inode) && (mode & S_IRWXG)) {
+ 			int error = check_acl(inode, mask);
+ 			if (error != -EAGAIN)	/* -EAGAIN means: fall through to the mode bits */
+ 				return error;
+ 		}
+ 
+ 		if (in_group_p(inode->i_gid))
+ 			mode >>= 3;	/* group member: use the group bits; otherwise the low bits stay as is */
+ 	}
+ 
+ 	/*
+ 	 * If the DACs are ok we don't need any capability check.
+ 	 */
+ 	if ((mask & ~mode & (MAY_READ | MAY_WRITE | MAY_EXEC)) == 0)
+ 		return 0;	/* every requested r/w/x bit is present in the selected mode bits */
+ 	return -EACCES;
+ }
+ 
+ /**
+  * generic_permission -  check for access rights on a Posix-like filesystem
+  * @inode:	inode to check access rights for
+  * @mask:	right to check for (%MAY_READ, %MAY_WRITE, %MAY_EXEC, ...)
+  *
+  * Used to check for read/write/execute permissions on a file.
+  * We use "fsuid" for this, letting us set arbitrary permissions
+  * for filesystem access without changing the "normal" uids which
+  * are used for other things.
+  *
+  * generic_permission is rcu-walk aware. It returns -ECHILD in case an rcu-walk
+  * request cannot be satisfied (eg. requires blocking or too much complexity).
+  * It would then be called again in ref-walk mode.  Returns 0 when access is allowed.
+  */
+ int generic_permission(struct inode *inode, int mask)
+ {
+ 	int ret;
+ 
+ 	/*
+ 	 * Do the basic permission checks.
+ 	 */
+ 	ret = acl_permission_check(inode, mask);
+ 	if (ret != -EACCES)
+ 		return ret;	/* success or any other error is final; only -EACCES reaches the capability checks */
+ 
+ 	if (S_ISDIR(inode->i_mode)) {
+ 		/* DACs are overridable for directories */
+ 		if (!(mask & MAY_WRITE))
+ 			if (capable_wrt_inode_uidgid(inode,
+ 						     CAP_DAC_READ_SEARCH))
+ 				return 0;
+ 		if (capable_wrt_inode_uidgid(inode, CAP_DAC_OVERRIDE))
+ 			return 0;
+ 		return -EACCES;
+ 	}
+ 
+ 	/*
+ 	 * Searching includes executable on directories, else just read.
+ 	 */
+ 	mask &= MAY_READ | MAY_WRITE | MAY_EXEC;	/* drop all other mask bits before the tests below */
+ 	if (mask == MAY_READ)
+ 		if (capable_wrt_inode_uidgid(inode, CAP_DAC_READ_SEARCH))
+ 			return 0;
+ 	/*
+ 	 * Read/write DACs are always overridable.
+ 	 * Executable DACs are overridable when there is
+ 	 * at least one exec bit set.
+ 	 */
+ 	if (!(mask & MAY_EXEC) || (inode->i_mode & S_IXUGO))
+ 		if (capable_wrt_inode_uidgid(inode, CAP_DAC_OVERRIDE))
+ 			return 0;
+ 
+ 	return -EACCES;
+ }
+ EXPORT_SYMBOL(generic_permission);
+ 
+ /*
+  * We _really_ want to just do "generic_permission()" without
+  * even looking at the inode->i_op values. So we keep a cache
+  * flag in inode->i_opflags, that says "this has no special
+  * permission function, use the fast case".
+  */
+ static inline int do_inode_permission(struct inode *inode, int mask)
+ {
+ 	if (unlikely(!(inode->i_opflags & IOP_FASTPERM))) {
+ 		if (likely(inode->i_op->permission))
+ 			return inode->i_op->permission(inode, mask);
+ 
+ 		/* This gets set once for the inode lifetime */
+ 		spin_lock(&inode->i_lock);
+ 		inode->i_opflags |= IOP_FASTPERM;
+ 		spin_unlock(&inode->i_lock);
+ 	}
+ 	return generic_permission(inode, mask);
+ }
+ 
+ /**
+  * sb_permission - Check superblock-level permissions
+  * @sb: Superblock of inode to check permission on
+  * @inode: Inode to check permission on
+  * @mask: Right to check for (%MAY_READ, %MAY_WRITE, %MAY_EXEC)
+  *
+  * Separate out file-system wide checks from inode-specific permission checks.
+  */
+ static int sb_permission(struct super_block *sb, struct inode *inode, int mask)
+ {
+ 	if (unlikely(mask & MAY_WRITE)) {
+ 		umode_t mode = inode->i_mode;
+ 
+ 		/* Nobody gets write access to a read-only fs. */
+ 		if (sb_rdonly(sb) && (S_ISREG(mode) || S_ISDIR(mode) || S_ISLNK(mode)))
+ 			return -EROFS;
+ 	}
+ 	return 0;
+ }
+ 
+ /**
+  * inode_permission - Check for access rights to a given inode
+  * @inode: Inode to check permission on
+  * @mask: Right to check for (%MAY_READ, %MAY_WRITE, %MAY_EXEC)
+  *
+  * Check for read/write/execute permissions on an inode.  We use fs[ug]id for
+  * this, letting us set arbitrary permissions for filesystem access without
+  * changing the "normal" UIDs which are used for other things.
+  *
+  * When checking for MAY_APPEND, MAY_WRITE must also be set in @mask.
+  */
+ int inode_permission(struct inode *inode, int mask)
+ {
+ 	int retval;
+ 
+ 	retval = sb_permission(inode->i_sb, inode, mask);
+ 	if (retval)
+ 		return retval;
+ 
+ 	if (unlikely(mask & MAY_WRITE)) {
+ 		/*
+ 		 * Nobody gets write access to an immutable file.
+ 		 */
+ 		if (IS_IMMUTABLE(inode))
+ 			return -EPERM;
+ 
+ 		/*
+ 		 * Updating mtime will likely cause i_uid and i_gid to be
+ 		 * written back improperly if their true value is unknown
+ 		 * to the vfs.
+ 		 */
+ 		if (HAS_UNMAPPED_ID(inode))
+ 			return -EACCES;
+ 	}
+ 
+ 	retval = do_inode_permission(inode, mask);
+ 	if (retval)
+ 		return retval;
+ 
+ 	retval = devcgroup_inode_permission(inode, mask);
+ 	if (retval)
+ 		return retval;
+ 
+ 	return security_inode_permission(inode, mask);
+ }
+ EXPORT_SYMBOL(inode_permission);
+ 
+ /**
+  * path_get - get a reference to a path
+  * @path: path to get the reference to
+  *
+  * Given a path increment the reference count to the dentry and the vfsmount.
+  */
+ void path_get(const struct path *path)
+ {
+ 	mntget(path->mnt);
+ 	dget(path->dentry);
+ }
+ EXPORT_SYMBOL(path_get);
+ 
+ /**
+  * path_put - put a reference to a path
+  * @path: path to put the reference to
+  *
+  * Given a path decrement the reference count to the dentry and the vfsmount.
+  */
+ void path_put(const struct path *path)
+ {
+ 	dput(path->dentry);
+ 	mntput(path->mnt);
+ }
+ EXPORT_SYMBOL(path_put);
+ 
+ #define EMBEDDED_LEVELS 2
+ struct nameidata {
+ 	struct path	path;
+ 	struct qstr	last;
+ 	struct path	root;
+ 	struct inode	*inode; /* path.dentry.d_inode */
+ 	unsigned int	flags;
+ 	unsigned	seq, m_seq;
+ 	int		last_type;
+ 	unsigned	depth;
+ 	int		total_link_count;
+ 	struct saved {
+ 		struct path link;
+ 		struct delayed_call done;
+ 		const char *name;
+ 		unsigned seq;
+ 	} *stack, internal[EMBEDDED_LEVELS];
+ 	struct filename	*name;
+ 	struct nameidata *saved;
+ 	struct inode	*link_inode;
+ 	unsigned	root_seq;
+ 	int		dfd;
+ } __randomize_layout;
+ 
+ static void set_nameidata(struct nameidata *p, int dfd, struct filename *name)
+ {
+ 	struct nameidata *old = current->nameidata;
+ 	p->stack = p->internal;
+ 	p->dfd = dfd;
+ 	p->name = name;
+ 	p->total_link_count = old ? old->total_link_count : 0;
+ 	p->saved = old;
+ 	current->nameidata = p;
+ }
+ 
+ static void restore_nameidata(void)
+ {
+ 	struct nameidata *now = current->nameidata, *old = now->saved;
+ 
+ 	current->nameidata = old;
+ 	if (old)
+ 		old->total_link_count = now->total_link_count;
+ 	if (now->stack != now->internal)
+ 		kfree(now->stack);
+ }
+ 
+ static int __nd_alloc_stack(struct nameidata *nd)
+ {
+ 	struct saved *p;
+ 
+ 	if (nd->flags & LOOKUP_RCU) {
+ 		p= kmalloc_array(MAXSYMLINKS, sizeof(struct saved),
+ 				  GFP_ATOMIC);
+ 		if (unlikely(!p))
+ 			return -ECHILD;
+ 	} else {
+ 		p= kmalloc_array(MAXSYMLINKS, sizeof(struct saved),
+ 				  GFP_KERNEL);
+ 		if (unlikely(!p))
+ 			return -ENOMEM;
+ 	}
+ 	memcpy(p, nd->internal, sizeof(nd->internal));
+ 	nd->stack = p;
+ 	return 0;
+ }
+ 
+ /**
+  * path_connected - Verify that a path->dentry is below path->mnt.mnt_root
+  * @path: path to verify
+  *
+  * Rename can sometimes move a file or directory outside of a bind
+  * mount, path_connected allows those cases to be detected.
+  */
+ static bool path_connected(const struct path *path)
+ {
+ 	struct vfsmount *mnt = path->mnt;
+ 	struct super_block *sb = mnt->mnt_sb;
+ 
+ 	/* Bind mounts and multi-root filesystems can have disconnected paths */
+ 	if (!(sb->s_iflags & SB_I_MULTIROOT) && (mnt->mnt_root == sb->s_root))
+ 		return true;
+ 
+ 	return is_subdir(path->dentry, mnt->mnt_root);
+ }
+ 
+ static inline int nd_alloc_stack(struct nameidata *nd)
+ {
+ 	if (likely(nd->depth != EMBEDDED_LEVELS))
+ 		return 0;
+ 	if (likely(nd->stack != nd->internal))
+ 		return 0;
+ 	return __nd_alloc_stack(nd);
+ }
+ 
+ static void drop_links(struct nameidata *nd)
+ {
+ 	int i = nd->depth;
+ 	while (i--) {
+ 		struct saved *last = nd->stack + i;
+ 		do_delayed_call(&last->done);
+ 		clear_delayed_call(&last->done);
+ 	}
+ }
+ 
+ static void terminate_walk(struct nameidata *nd)
+ {
+ 	drop_links(nd);
+ 	if (!(nd->flags & LOOKUP_RCU)) {
+ 		int i;
+ 		path_put(&nd->path);
+ 		for (i = 0; i < nd->depth; i++)
+ 			path_put(&nd->stack[i].link);
+ 		if (nd->flags & LOOKUP_ROOT_GRABBED) {
+ 			path_put(&nd->root);
+ 			nd->flags &= ~LOOKUP_ROOT_GRABBED;
+ 		}
+ 	} else {
+ 		nd->flags &= ~LOOKUP_RCU;
+ 		rcu_read_unlock();
+ 	}
+ 	nd->depth = 0;
+ }
+ 
+ /* path_put is needed afterwards regardless of success or failure */
+ static bool legitimize_path(struct nameidata *nd,
+ 			    struct path *path, unsigned seq)
+ {
+ 	int res = __legitimize_mnt(path->mnt, nd->m_seq);
+ 	if (unlikely(res)) {
+ 		if (res > 0)
+ 			path->mnt = NULL;
+ 		path->dentry = NULL;
+ 		return false;
+ 	}
+ 	if (unlikely(!lockref_get_not_dead(&path->dentry->d_lockref))) {
+ 		path->dentry = NULL;
+ 		return false;
+ 	}
+ 	return !read_seqcount_retry(&path->dentry->d_seq, seq);
+ }
+ 
+ static bool legitimize_links(struct nameidata *nd)
+ {
+ 	int i;
+ 	for (i = 0; i < nd->depth; i++) {
+ 		struct saved *last = nd->stack + i;
+ 		if (unlikely(!legitimize_path(nd, &last->link, last->seq))) {
+ 			drop_links(nd);
+ 			nd->depth = i + 1;
+ 			return false;
+ 		}
+ 	}
+ 	return true;
+ }
+ 
+ static bool legitimize_root(struct nameidata *nd)
+ {
+ 	if (!nd->root.mnt || (nd->flags & LOOKUP_ROOT))
+ 		return true;
+ 	nd->flags |= LOOKUP_ROOT_GRABBED;
+ 	return legitimize_path(nd, &nd->root, nd->root_seq);
+ }
+ 
+ /*
+  * Path walking has 2 modes, rcu-walk and ref-walk (see
+  * Documentation/filesystems/path-lookup.txt).  In situations when we can't
+  * continue in RCU mode, we attempt to drop out of rcu-walk mode and grab
+  * normal reference counts on dentries and vfsmounts to transition to ref-walk
+  * mode.  Refcounts are grabbed at the last known good point before rcu-walk
+  * got stuck, so ref-walk may continue from there. If this is not successful
+  * (eg. a seqcount has changed), then failure is returned and it's up to caller
+  * to restart the path walk from the beginning in ref-walk mode.
+  */
+ 
+ /**
+  * unlazy_walk - try to switch to ref-walk mode.
+  * @nd: nameidata pathwalk data
+  * Returns: 0 on success, -ECHILD on failure
+  *
+  * unlazy_walk attempts to legitimize the current nd->path and nd->root
+  * for ref-walk mode.
+  * Must be called from rcu-walk context.
+  * Nothing should touch nameidata between unlazy_walk() failure and
+  * terminate_walk().
+  */
+ static int unlazy_walk(struct nameidata *nd)
+ {
+ 	struct dentry *parent = nd->path.dentry;
+ 
+ 	BUG_ON(!(nd->flags & LOOKUP_RCU));
+ 
+ 	nd->flags &= ~LOOKUP_RCU;
+ 	if (unlikely(!legitimize_links(nd)))
+ 		goto out1;
+ 	if (unlikely(!legitimize_path(nd, &nd->path, nd->seq)))
+ 		goto out;
+ 	if (unlikely(!legitimize_root(nd)))
+ 		goto out;
+ 	rcu_read_unlock();
+ 	BUG_ON(nd->inode != parent->d_inode);
+ 	return 0;
+ 
+ out1:
+ 	nd->path.mnt = NULL;
+ 	nd->path.dentry = NULL;
+ out:
+ 	rcu_read_unlock();
+ 	return -ECHILD;
+ }
+ 
+ /**
+  * unlazy_child - try to switch to ref-walk mode.
+  * @nd: nameidata pathwalk data
+  * @dentry: child of nd->path.dentry
+  * @seq: seq number to check dentry against
+  * Returns: 0 on success, -ECHILD on failure
+  *
+  * unlazy_child attempts to legitimize the current nd->path, nd->root and dentry
+  * for ref-walk mode.  @dentry must be a path found by a do_lookup call on
+  * @nd.  Must be called from rcu-walk context.
+  * Nothing should touch nameidata between unlazy_child() failure and
+  * terminate_walk().
+  */
+ static int unlazy_child(struct nameidata *nd, struct dentry *dentry, unsigned seq)
+ {
+ 	BUG_ON(!(nd->flags & LOOKUP_RCU));
+ 
+ 	nd->flags &= ~LOOKUP_RCU;
+ 	if (unlikely(!legitimize_links(nd)))
+ 		goto out2;
+ 	if (unlikely(!legitimize_mnt(nd->path.mnt, nd->m_seq)))
+ 		goto out2;
+ 	if (unlikely(!lockref_get_not_dead(&nd->path.dentry->d_lockref)))
+ 		goto out1;
+ 
+ 	/*
+ 	 * We need to move both the parent and the dentry from the RCU domain
+ 	 * to be properly refcounted. And the sequence number in the dentry
+ 	 * validates *both* dentry counters, since we checked the sequence
+ 	 * number of the parent after we got the child sequence number. So we
+ 	 * know the parent must still be valid if the child sequence number is.
+ 	 */
+ 	if (unlikely(!lockref_get_not_dead(&dentry->d_lockref)))
+ 		goto out;
+ 	if (unlikely(read_seqcount_retry(&dentry->d_seq, seq)))
+ 		goto out_dput;
+ 	/*
+ 	 * Sequence counts matched. Now make sure that the root is
+ 	 * still valid and get it if required.
+ 	 */
+ 	if (unlikely(!legitimize_root(nd)))
+ 		goto out_dput;
+ 	rcu_read_unlock();
+ 	return 0;
+ 
+ out2:
+ 	nd->path.mnt = NULL;
+ out1:
+ 	nd->path.dentry = NULL;
+ out:
+ 	rcu_read_unlock();
+ 	return -ECHILD;
+ out_dput:
+ 	rcu_read_unlock();
+ 	dput(dentry);
+ 	return -ECHILD;
+ }
+ 
+ static inline int d_revalidate(struct dentry *dentry, unsigned int flags)
+ {
+ 	if (unlikely(dentry->d_flags & DCACHE_OP_REVALIDATE))
+ 		return dentry->d_op->d_revalidate(dentry, flags);
+ 	else
+ 		return 1;
+ }
+ 
+ /**
+  * complete_walk - successful completion of path walk
+  * @nd:  pointer to nameidata
+  *
+  * If we had been in RCU mode, drop out of it and legitimize nd->path.
+  * Revalidate the final result, unless we'd already done that during
+  * the path walk or the filesystem doesn't ask for it.  Return 0 on
+  * success, -error on failure.  In case of failure caller does not
+  * need to drop nd->path.
+  */
+ static int complete_walk(struct nameidata *nd)
+ {
+ 	struct dentry *dentry = nd->path.dentry;
+ 	int status;
+ 
+ 	if (nd->flags & LOOKUP_RCU) {
+ 		if (!(nd->flags & LOOKUP_ROOT))
+ 			nd->root.mnt = NULL;
+ 		if (unlikely(unlazy_walk(nd)))
+ 			return -ECHILD;
+ 	}
+ 
+ 	if (likely(!(nd->flags & LOOKUP_JUMPED)))
+ 		return 0;
+ 
+ 	if (likely(!(dentry->d_flags & DCACHE_OP_WEAK_REVALIDATE)))
+ 		return 0;
+ 
+ 	status = dentry->d_op->d_weak_revalidate(dentry, nd->flags);
+ 	if (status > 0)
+ 		return 0;
+ 
+ 	if (!status)
+ 		status = -ESTALE;
+ 
+ 	return status;
+ }
+ 
+ static void set_root(struct nameidata *nd)
+ {
+ 	struct fs_struct *fs = current->fs;
+ 
+ 	if (nd->flags & LOOKUP_RCU) {
+ 		unsigned seq;
+ 
+ 		do {
+ 			seq = read_seqcount_begin(&fs->seq);
+ 			nd->root = fs->root;
+ 			nd->root_seq = __read_seqcount_begin(&nd->root.dentry->d_seq);
+ 		} while (read_seqcount_retry(&fs->seq, seq));
+ 	} else {
+ 		get_fs_root(fs, &nd->root);
+ 		nd->flags |= LOOKUP_ROOT_GRABBED;
+ 	}
+ }
+ 
+ static void path_put_conditional(struct path *path, struct nameidata *nd)
+ {
+ 	dput(path->dentry);
+ 	if (path->mnt != nd->path.mnt)
+ 		mntput(path->mnt);
+ }
+ 
+ static inline void path_to_nameidata(const struct path *path,
+ 					struct nameidata *nd)
+ {
+ 	if (!(nd->flags & LOOKUP_RCU)) {
+ 		dput(nd->path.dentry);
+ 		if (nd->path.mnt != path->mnt)
+ 			mntput(nd->path.mnt);
+ 	}
+ 	nd->path.mnt = path->mnt;
+ 	nd->path.dentry = path->dentry;
+ }
+ 
+ static int nd_jump_root(struct nameidata *nd)
+ {
+ 	if (nd->flags & LOOKUP_RCU) {
+ 		struct dentry *d;
+ 		nd->path = nd->root;
+ 		d = nd->path.dentry;
+ 		nd->inode = d->d_inode;
+ 		nd->seq = nd->root_seq;
+ 		if (unlikely(read_seqcount_retry(&d->d_seq, nd->seq)))
+ 			return -ECHILD;
+ 	} else {
+ 		path_put(&nd->path);
+ 		nd->path = nd->root;
+ 		path_get(&nd->path);
+ 		nd->inode = nd->path.dentry->d_inode;
+ 	}
+ 	nd->flags |= LOOKUP_JUMPED;
+ 	return 0;
+ }
+ 
+ /*
+  * Helper to directly jump to a known parsed path from ->get_link,
+  * caller must have taken a reference to path beforehand.
+  */
+ void nd_jump_link(struct path *path)
+ {
+ 	struct nameidata *nd = current->nameidata;
+ 	path_put(&nd->path);
+ 
+ 	nd->path = *path;
+ 	nd->inode = nd->path.dentry->d_inode;
+ 	nd->flags |= LOOKUP_JUMPED;
+ }
+ 
+ static inline void put_link(struct nameidata *nd)
+ {
+ 	struct saved *last = nd->stack + --nd->depth;
+ 	do_delayed_call(&last->done);
+ 	if (!(nd->flags & LOOKUP_RCU))
+ 		path_put(&last->link);
+ }
+ 
+ int sysctl_protected_symlinks __read_mostly = 1;
+ int sysctl_protected_hardlinks __read_mostly = 1;
+ int sysctl_protected_fifos __read_mostly = 2;
+ int sysctl_protected_regular __read_mostly = 2;
+ 
+ /**
+  * may_follow_link - Check symlink following for unsafe situations
+  * @nd: nameidata pathwalk data
+  *
+  * In the case of the sysctl_protected_symlinks sysctl being enabled,
+  * CAP_DAC_OVERRIDE needs to be specifically ignored if the symlink is
+  * in a sticky world-writable directory. This is to protect privileged
+  * processes from failing races against path names that may change out
+  * from under them by way of other users creating malicious symlinks.
+  * It will permit symlinks to be followed only when outside a sticky
+  * world-writable directory, or when the uid of the symlink and follower
+  * match, or when the directory owner matches the symlink's owner.
+  *
+  * Returns 0 if following the symlink is allowed, -ve on error.
+  */
+ static inline int may_follow_link(struct nameidata *nd)
+ {
+ 	const struct inode *inode;
+ 	const struct inode *parent;
+ 	kuid_t puid;
+ 
+ 	if (!sysctl_protected_symlinks)
+ 		return 0;
+ 
+ 	/* Allowed if owner and follower match. */
+ 	inode = nd->link_inode;
+ 	if (uid_eq(current_cred()->fsuid, inode->i_uid))
+ 		return 0;
+ 
+ 	/* Allowed if parent directory not sticky and world-writable. */
+ 	parent = nd->inode;
+ 	if ((parent->i_mode & (S_ISVTX|S_IWOTH)) != (S_ISVTX|S_IWOTH))
+ 		return 0;
+ 
+ 	/* Allowed if parent directory and link owner match. */
+ 	puid = parent->i_uid;
+ 	if (uid_valid(puid) && uid_eq(puid, inode->i_uid))
+ 		return 0;
+ 
+ 	if (nd->flags & LOOKUP_RCU)
+ 		return -ECHILD;
+ 
+ 	audit_inode(nd->name, nd->stack[0].link.dentry, 0);
+ 	audit_log_link_denied("follow_link");
+ 	return -EACCES;
+ }
+ 
+ /**
+  * safe_hardlink_source - Check for safe hardlink conditions
+  * @inode: the source inode to hardlink from
+  *
+  * Return false if at least one of the following conditions holds:
+  *    - inode is not a regular file
+  *    - inode is setuid
+  *    - inode is setgid and group-exec
+  *    - access failure for read and write
+  *
+  * Otherwise returns true.
+  */
+ static bool safe_hardlink_source(struct inode *inode)
+ {
+ 	umode_t mode = inode->i_mode;
+ 
+ 	/* Special files should not get pinned to the filesystem. */
+ 	if (!S_ISREG(mode))
+ 		return false;
+ 
+ 	/* Setuid files should not get pinned to the filesystem. */
+ 	if (mode & S_ISUID)
+ 		return false;
+ 
+ 	/* Executable setgid files should not get pinned to the filesystem. */
+ 	if ((mode & (S_ISGID | S_IXGRP)) == (S_ISGID | S_IXGRP))
+ 		return false;
+ 
+ 	/* Hardlinking to unreadable or unwritable sources is dangerous. */
+ 	if (inode_permission(inode, MAY_READ | MAY_WRITE))
+ 		return false;
+ 
+ 	return true;
+ }
+ 
+ /**
+  * may_linkat - Check permissions for creating a hardlink
+  * @link: the source to hardlink from
+  *
+  * Block hardlink when all of:
+  *  - sysctl_protected_hardlinks enabled
+  *  - fsuid does not match inode
+  *  - hardlink source is unsafe (see safe_hardlink_source() above)
+  *  - not CAP_FOWNER in a namespace with the inode owner uid mapped
+  *
+  * Returns 0 if successful, -ve on error.
+  */
+ static int may_linkat(struct path *link)
+ {
+ 	struct inode *inode = link->dentry->d_inode;
+ 
+ 	/* Inode writeback is not safe when the uid or gid are invalid. */
+ 	if (!uid_valid(inode->i_uid) || !gid_valid(inode->i_gid))
+ 		return -EOVERFLOW;
+ 
+ 	if (!sysctl_protected_hardlinks)
+ 		return 0;
+ 
+ 	/* Source inode owner (or CAP_FOWNER) can hardlink all they like,
+ 	 * otherwise, it must be a safe source.
+ 	 */
+ 	if (safe_hardlink_source(inode) || inode_owner_or_capable(inode))
+ 		return 0;
+ 
+ 	audit_log_link_denied("linkat");
+ 	return -EPERM;
+ }
+ 
+ /**
+  * may_create_in_sticky - Check whether an O_CREAT open in a sticky directory
+  *			  should be allowed, or not, on files that already
+  *			  exist.
+  * @dir_mode: mode bits of directory
+  * @dir_uid: owner of directory
+  * @inode: the inode of the file to open
+  *
+  * Block an O_CREAT open of a FIFO (or a regular file) when:
+  *   - sysctl_protected_fifos (or sysctl_protected_regular) is enabled
+  *   - the file already exists
+  *   - we are in a sticky directory
+  *   - we don't own the file
+  *   - the owner of the directory doesn't own the file
+  *   - the directory is world writable
+  * If the sysctl_protected_fifos (or sysctl_protected_regular) is set to 2
+  * the directory doesn't have to be world writable: being group writable will
+  * be enough.
+  *
+  * Returns 0 if the open is allowed, -ve on error.
+  */
+ static int may_create_in_sticky(umode_t dir_mode, kuid_t dir_uid,
+ 				struct inode * const inode)
+ {
+ 	if ((!sysctl_protected_fifos && S_ISFIFO(inode->i_mode)) ||
+ 	    (!sysctl_protected_regular && S_ISREG(inode->i_mode)) ||
+ 	    likely(!(dir_mode & S_ISVTX)) ||
+ 	    uid_eq(inode->i_uid, dir_uid) ||
+ 	    uid_eq(current_fsuid(), inode->i_uid))
+ 		return 0;
+ 
+ 	if (likely(dir_mode & 0002) ||
+ 	    (dir_mode & 0020 &&
+ 	     ((sysctl_protected_fifos >= 2 && S_ISFIFO(inode->i_mode)) ||
+ 	      (sysctl_protected_regular >= 2 && S_ISREG(inode->i_mode))))) {
+ 		return -EACCES;
+ 	}
+ 	return 0;
+ }
+ 
+ static __always_inline
+ const char *get_link(struct nameidata *nd)
+ {
+ 	struct saved *last = nd->stack + nd->depth - 1;
+ 	struct dentry *dentry = last->link.dentry;
+ 	struct inode *inode = nd->link_inode;
+ 	int error;
+ 	const char *res;
+ 
+ 	if (!(nd->flags & LOOKUP_RCU)) {
+ 		touch_atime(&last->link);
+ 		cond_resched();
+ 	} else if (atime_needs_update(&last->link, inode)) {
+ 		if (unlikely(unlazy_walk(nd)))
+ 			return ERR_PTR(-ECHILD);
+ 		touch_atime(&last->link);
+ 	}
+ 
+ 	error = security_inode_follow_link(dentry, inode,
+ 					   nd->flags & LOOKUP_RCU);
+ 	if (unlikely(error))
+ 		return ERR_PTR(error);
+ 
+ 	nd->last_type = LAST_BIND;
+ 	res = READ_ONCE(inode->i_link);
+ 	if (!res) {
+ 		const char * (*get)(struct dentry *, struct inode *,
+ 				struct delayed_call *);
+ 		get = inode->i_op->get_link;
+ 		if (nd->flags & LOOKUP_RCU) {
+ 			res = get(NULL, inode, &last->done);
+ 			if (res == ERR_PTR(-ECHILD)) {
+ 				if (unlikely(unlazy_walk(nd)))
+ 					return ERR_PTR(-ECHILD);
+ 				res = get(dentry, inode, &last->done);
+ 			}
+ 		} else {
+ 			res = get(dentry, inode, &last->done);
+ 		}
+ 		if (IS_ERR_OR_NULL(res))
+ 			return res;
+ 	}
+ 	if (*res == '/') {
+ 		if (!nd->root.mnt)
+ 			set_root(nd);
+ 		if (unlikely(nd_jump_root(nd)))
+ 			return ERR_PTR(-ECHILD);
+ 		while (unlikely(*++res == '/'))
+ 			;
+ 	}
+ 	if (!*res)
+ 		res = NULL;
+ 	return res;
+ }
+ 
+ /*
+  * follow_up - Find the mountpoint of path's vfsmount
+  *
+  * Given a path, find the mountpoint of its source file system.
+  * Replace @path with the path of the mountpoint in the parent mount.
+  * Up is towards /.
+  *
+  * Return 1 if we went up a level and 0 if we were already at the
+  * root.
+  */
+ int follow_up(struct path *path)
+ {
+ 	struct mount *mnt = real_mount(path->mnt);
+ 	struct mount *parent;
+ 	struct dentry *mountpoint;
+ 
+ 	read_seqlock_excl(&mount_lock);
+ 	parent = mnt->mnt_parent;
+ 	if (parent == mnt) {
+ 		read_sequnlock_excl(&mount_lock);
+ 		return 0;
+ 	}
+ 	mntget(&parent->mnt);
+ 	mountpoint = dget(mnt->mnt_mountpoint);
+ 	read_sequnlock_excl(&mount_lock);
+ 	dput(path->dentry);
+ 	path->dentry = mountpoint;
+ 	mntput(path->mnt);
+ 	path->mnt = &parent->mnt;
+ 	return 1;
+ }
+ EXPORT_SYMBOL(follow_up);
+ 
+ /*
+  * Perform an automount
+  * - return -EISDIR to tell follow_managed() to stop and return the path we
+  *   were called with.
+  */
+ static int follow_automount(struct path *path, struct nameidata *nd,
+ 			    bool *need_mntput)
+ {
+ 	struct vfsmount *mnt;
+ 	int err;
+ 
+ 	if (!path->dentry->d_op || !path->dentry->d_op->d_automount)
+ 		return -EREMOTE;
+ 
+ 	/* We don't want to mount if someone's just doing a stat -
+ 	 * unless they're stat'ing a directory and appended a '/' to
+ 	 * the name.
+ 	 *
+ 	 * We do, however, want to mount if someone wants to open or
+ 	 * create a file of any type under the mountpoint, wants to
+ 	 * traverse through the mountpoint or wants to open the
+ 	 * mounted directory.  Also, autofs may mark negative dentries
+ 	 * as being automount points.  These will need the attentions
+ 	 * of the daemon to instantiate them before they can be used.
+ 	 */
+ 	if (!(nd->flags & (LOOKUP_PARENT | LOOKUP_DIRECTORY |
+ 			   LOOKUP_OPEN | LOOKUP_CREATE | LOOKUP_AUTOMOUNT)) &&
+ 	    path->dentry->d_inode)
+ 		return -EISDIR;
+ 
+ 	nd->total_link_count++;
+ 	if (nd->total_link_count >= 40)
+ 		return -ELOOP;
+ 
+ 	mnt = path->dentry->d_op->d_automount(path);
+ 	if (IS_ERR(mnt)) {
+ 		/*
+ 		 * The filesystem is allowed to return -EISDIR here to indicate
+ 		 * it doesn't want to automount.  For instance, autofs would do
+ 		 * this so that its userspace daemon can mount on this dentry.
+ 		 *
+ 		 * However, we can only permit this if it's a terminal point in
+ 		 * the path being looked up; if it wasn't then the remainder of
+ 		 * the path is inaccessible and we should say so.
+ 		 */
+ 		if (PTR_ERR(mnt) == -EISDIR && (nd->flags & LOOKUP_PARENT))
+ 			return -EREMOTE;
+ 		return PTR_ERR(mnt);
+ 	}
+ 
+ 	if (!mnt) /* mount collision */
+ 		return 0;
+ 
+ 	if (!*need_mntput) {
+ 		/* lock_mount() may release path->mnt on error */
+ 		mntget(path->mnt);
+ 		*need_mntput = true;
+ 	}
+ 	err = finish_automount(mnt, path);
+ 
+ 	switch (err) {
+ 	case -EBUSY:
+ 		/* Someone else made a mount here whilst we were busy */
+ 		return 0;
+ 	case 0:
+ 		path_put(path);
+ 		path->mnt = mnt;
+ 		path->dentry = dget(mnt->mnt_root);
+ 		return 0;
+ 	default:
+ 		return err;
+ 	}
+ 
+ }
+ 
+ /*
+  * Handle a dentry that is managed in some way.
+  * - Flagged for transit management (autofs)
+  * - Flagged as mountpoint
+  * - Flagged as automount point
+  *
+  * This may only be called in refwalk mode.
+  *
+  * Serialization is taken care of in namespace.c
+  */
+ static int follow_managed(struct path *path, struct nameidata *nd)
+ {
+ 	struct vfsmount *mnt = path->mnt; /* held by caller, must be left alone */
+ 	unsigned managed;
+ 	bool need_mntput = false;
+ 	int ret = 0;
+ 
+ 	/* Given that we're not holding a lock here, we retain the value in a
+ 	 * local variable for each dentry as we look at it so that we don't see
+ 	 * the components of that value change under us */
+ 	while (managed = READ_ONCE(path->dentry->d_flags),
+ 	       managed &= DCACHE_MANAGED_DENTRY,
+ 	       unlikely(managed != 0)) {
+ 		/* Allow the filesystem to manage the transit without i_mutex
+ 		 * being held. */
+ 		if (managed & DCACHE_MANAGE_TRANSIT) {
+ 			BUG_ON(!path->dentry->d_op);
+ 			BUG_ON(!path->dentry->d_op->d_manage);
+ 			ret = path->dentry->d_op->d_manage(path, false);
+ 			if (ret < 0)
+ 				break;
+ 		}
+ 
+ 		/* Transit to a mounted filesystem. */
+ 		if (managed & DCACHE_MOUNTED) {
+ 			struct vfsmount *mounted = lookup_mnt(path);
+ 			if (mounted) {
+ 				dput(path->dentry);
+ 				if (need_mntput)
+ 					mntput(path->mnt);
+ 				path->mnt = mounted;
+ 				path->dentry = dget(mounted->mnt_root);
+ 				need_mntput = true;
+ 				continue;
+ 			}
+ 
+ 			/* Something is mounted on this dentry in another
+ 			 * namespace and/or whatever was mounted there in this
+ 			 * namespace got unmounted before lookup_mnt() could
+ 			 * get it */
+ 		}
+ 
+ 		/* Handle an automount point */
+ 		if (managed & DCACHE_NEED_AUTOMOUNT) {
+ 			ret = follow_automount(path, nd, &need_mntput);
+ 			if (ret < 0)
+ 				break;
+ 			continue;
+ 		}
+ 
+ 		/* We didn't change the current path point */
+ 		break;
+ 	}
+ 
+ 	if (need_mntput && path->mnt == mnt)
+ 		mntput(path->mnt);
+ 	if (ret == -EISDIR || !ret)
+ 		ret = 1;
+ 	if (need_mntput)
+ 		nd->flags |= LOOKUP_JUMPED;
+ 	if (unlikely(ret < 0))
+ 		path_put_conditional(path, nd);
+ 	return ret;
+ }
+ 
+ int follow_down_one(struct path *path)
+ {
+ 	struct vfsmount *mounted;
+ 
+ 	mounted = lookup_mnt(path);
+ 	if (mounted) {
+ 		dput(path->dentry);
+ 		mntput(path->mnt);
+ 		path->mnt = mounted;
+ 		path->dentry = dget(mounted->mnt_root);
+ 		return 1;
+ 	}
+ 	return 0;
+ }
+ EXPORT_SYMBOL(follow_down_one);
+ 
+ static inline int managed_dentry_rcu(const struct path *path)
+ {
+ 	return (path->dentry->d_flags & DCACHE_MANAGE_TRANSIT) ?
+ 		path->dentry->d_op->d_manage(path, true) : 0;
+ }
+ 
+ /*
+  * Try to skip to top of mountpoint pile in rcuwalk mode.  Fail if
+  * we meet a managed dentry that would need blocking.
+  */
+ static bool __follow_mount_rcu(struct nameidata *nd, struct path *path,
+ 			       struct inode **inode, unsigned *seqp)
+ {
+ 	for (;;) {
+ 		struct mount *mounted;
+ 		/*
+ 		 * Don't forget we might have a non-mountpoint managed dentry
+ 		 * that wants to block transit.
+ 		 */
+ 		switch (managed_dentry_rcu(path)) {
+ 		case -ECHILD:
+ 		default:
+ 			return false;
+ 		case -EISDIR:
+ 			return true;
+ 		case 0:
+ 			break;
+ 		}
+ 
+ 		if (!d_mountpoint(path->dentry))
+ 			return !(path->dentry->d_flags & DCACHE_NEED_AUTOMOUNT);
+ 
+ 		mounted = __lookup_mnt(path->mnt, path->dentry);
+ 		if (!mounted)
+ 			break;
+ 		path->mnt = &mounted->mnt;
+ 		path->dentry = mounted->mnt.mnt_root;
+ 		nd->flags |= LOOKUP_JUMPED;
+ 		*seqp = read_seqcount_begin(&path->dentry->d_seq);
+ 		/*
+ 		 * Update the inode too. We don't need to re-check the
+ 		 * dentry sequence number here after this d_inode read,
+ 		 * because a mount-point is always pinned.
+ 		 */
+ 		*inode = path->dentry->d_inode;
+ 	}
+ 	return !read_seqretry(&mount_lock, nd->m_seq) &&
+ 		!(path->dentry->d_flags & DCACHE_NEED_AUTOMOUNT);
+ }
+ 
+ static int follow_dotdot_rcu(struct nameidata *nd)
+ {
+ 	struct inode *inode = nd->inode;
+ 
+ 	while (1) {
+ 		if (path_equal(&nd->path, &nd->root))
+ 			break;
+ 		if (nd->path.dentry != nd->path.mnt->mnt_root) {
+ 			struct dentry *old = nd->path.dentry;
+ 			struct dentry *parent = old->d_parent;
+ 			unsigned seq;
+ 
+ 			inode = parent->d_inode;
+ 			seq = read_seqcount_begin(&parent->d_seq);
+ 			if (unlikely(read_seqcount_retry(&old->d_seq, nd->seq)))
+ 				return -ECHILD;
+ 			nd->path.dentry = parent;
+ 			nd->seq = seq;
+ 			if (unlikely(!path_connected(&nd->path)))
+ 				return -ECHILD;
+ 			break;
+ 		} else {
+ 			struct mount *mnt = real_mount(nd->path.mnt);
+ 			struct mount *mparent = mnt->mnt_parent;
+ 			struct dentry *mountpoint = mnt->mnt_mountpoint;
+ 			struct inode *inode2 = mountpoint->d_inode;
+ 			unsigned seq = read_seqcount_begin(&mountpoint->d_seq);
+ 			if (unlikely(read_seqretry(&mount_lock, nd->m_seq)))
+ 				return -ECHILD;
+ 			if (&mparent->mnt == nd->path.mnt)
+ 				break;
+ 			/* we know that mountpoint was pinned */
+ 			nd->path.dentry = mountpoint;
+ 			nd->path.mnt = &mparent->mnt;
+ 			inode = inode2;
+ 			nd->seq = seq;
+ 		}
+ 	}
+ 	while (unlikely(d_mountpoint(nd->path.dentry))) {
+ 		struct mount *mounted;
+ 		mounted = __lookup_mnt(nd->path.mnt, nd->path.dentry);
+ 		if (unlikely(read_seqretry(&mount_lock, nd->m_seq)))
+ 			return -ECHILD;
+ 		if (!mounted)
+ 			break;
+ 		nd->path.mnt = &mounted->mnt;
+ 		nd->path.dentry = mounted->mnt.mnt_root;
+ 		inode = nd->path.dentry->d_inode;
+ 		nd->seq = read_seqcount_begin(&nd->path.dentry->d_seq);
+ 	}
+ 	nd->inode = inode;
+ 	return 0;
+ }
+ 
+ /*
+  * Follow down to the covering mount currently visible to userspace.  At each
+  * point, the filesystem owning that dentry may be queried as to whether the
+  * caller is permitted to proceed or not.
+  */
+ int follow_down(struct path *path)
+ {
+ 	unsigned managed;
+ 	int ret;
+ 
+ 	while (managed = READ_ONCE(path->dentry->d_flags),
+ 	       unlikely(managed & DCACHE_MANAGED_DENTRY)) {
+ 		/* Allow the filesystem to manage the transit without i_mutex
+ 		 * being held.
+ 		 *
+ 		 * We indicate to the filesystem if someone is trying to mount
+ 		 * something here.  This gives autofs the chance to deny anyone
+ 		 * other than its daemon the right to mount on its
+ 		 * superstructure.
+ 		 *
+ 		 * The filesystem may sleep at this point.
+ 		 */
+ 		if (managed & DCACHE_MANAGE_TRANSIT) {
+ 			BUG_ON(!path->dentry->d_op);
+ 			BUG_ON(!path->dentry->d_op->d_manage);
+ 			ret = path->dentry->d_op->d_manage(path, false);
+ 			if (ret < 0)
+ 				return ret == -EISDIR ? 0 : ret;
+ 		}
+ 
+ 		/* Transit to a mounted filesystem. */
+ 		if (managed & DCACHE_MOUNTED) {
+ 			struct vfsmount *mounted = lookup_mnt(path);
+ 			if (!mounted)
+ 				break;
+ 			dput(path->dentry);
+ 			mntput(path->mnt);
+ 			path->mnt = mounted;
+ 			path->dentry = dget(mounted->mnt_root);
+ 			continue;
+ 		}
+ 
+ 		/* Don't handle automount points here */
+ 		break;
+ 	}
+ 	return 0;
+ }
+ EXPORT_SYMBOL(follow_down);
+ 
+ /*
+  * Skip to top of mountpoint pile in refwalk mode for follow_dotdot()
+  */
+ static void follow_mount(struct path *path)
+ {
+ 	while (d_mountpoint(path->dentry)) {
+ 		struct vfsmount *mounted = lookup_mnt(path);
+ 		if (!mounted)
+ 			break;
+ 		dput(path->dentry);
+ 		mntput(path->mnt);
+ 		path->mnt = mounted;
+ 		path->dentry = dget(mounted->mnt_root);
+ 	}
+ }
+ 
+ static int path_parent_directory(struct path *path)
+ {
+ 	struct dentry *child = path->dentry;
+ 	struct dentry *up = dget_parent(child);	/* rare legitimate dget_parent() use */
+ 
+ 	path->dentry = up;
+ 	dput(child);
+ 	/* -ENOENT when path_connected() rejects the new location */
+ 	return unlikely(!path_connected(path)) ? -ENOENT : 0;
+ }
+ 
+ static int follow_dotdot(struct nameidata *nd)
+ {
+ 	while(1) {
+ 		if (path_equal(&nd->path, &nd->root))	/* already at nd->root: ".." stays put */
+ 			break;
+ 		if (nd->path.dentry != nd->path.mnt->mnt_root) {
+ 			int ret = path_parent_directory(&nd->path);	/* ordinary step up inside this mount */
+ 			if (ret)
+ 				return ret;
+ 			break;
+ 		}
+ 		if (!follow_up(&nd->path))	/* at a mount root; presumably moves to the mountpoint below - TODO confirm */
+ 			break;
+ 	}
+ 	follow_mount(&nd->path);	/* then go to the top of whatever is mounted here */
+ 	nd->inode = nd->path.dentry->d_inode;
+ 	return 0;
+ }
+ 
+ /*
+  * This looks up the name in dcache and possibly revalidates the found dentry.
+  * NULL is returned if the dentry does not exist in the cache.
+  */
+ static struct dentry *lookup_dcache(const struct qstr *name,
+ 				    struct dentry *dir,
+ 				    unsigned int flags)
+ {
+ 	struct dentry *dentry = d_lookup(dir, name);
+ 	if (dentry) {
+ 		int error = d_revalidate(dentry, flags);
+ 		if (unlikely(error <= 0)) {	/* 0: invalidate it; <0: error from d_revalidate() */
+ 			if (!error)
+ 				d_invalidate(dentry);
+ 			dput(dentry);
+ 			return ERR_PTR(error);	/* for error == 0 this is NULL */
+ 		}
+ 	}
+ 	return dentry;
+ }
+ 
+ /*
+  * Parent directory has inode locked exclusive.  This is one
+  * and only case when ->lookup() gets called on non in-lookup
+  * dentries - as the matter of fact, this only gets called
+  * when directory is guaranteed to have no in-lookup children
+  * at all.
+  */
+ static struct dentry *__lookup_hash(const struct qstr *name,
+ 		struct dentry *base, unsigned int flags)
+ {
+ 	struct dentry *dentry = lookup_dcache(name, base, flags);
+ 	struct dentry *old;
+ 	struct inode *dir = base->d_inode;
+ 
+ 	if (dentry)	/* cached dentry, or an ERR_PTR from lookup_dcache() */
+ 		return dentry;
+ 
+ 	/* Don't create child dentry for a dead directory. */
+ 	if (unlikely(IS_DEADDIR(dir)))
+ 		return ERR_PTR(-ENOENT);
+ 
+ 	dentry = d_alloc(base, name);
+ 	if (unlikely(!dentry))
+ 		return ERR_PTR(-ENOMEM);
+ 
+ 	old = dir->i_op->lookup(dir, dentry, flags);
+ 	if (unlikely(old)) {	/* non-NULL result from ->lookup() replaces our dentry */
+ 		dput(dentry);
+ 		dentry = old;
+ 	}
+ 	return dentry;
+ }
+ 
+ static int lookup_fast(struct nameidata *nd,
+ 		       struct path *path, struct inode **inode,
+ 		       unsigned *seqp)
+ {
+ 	struct vfsmount *mnt = nd->path.mnt;
+ 	struct dentry *dentry, *parent = nd->path.dentry;
+ 	int status = 1;
+ 	int err;
+ 
+ 	/*
+ 	 * Rename seqlock is not required here because in the off chance
+ 	 * of a false negative due to a concurrent rename, the caller is
+ 	 * going to fall back to non-racy lookup.
+ 	 */
+ 	if (nd->flags & LOOKUP_RCU) {
+ 		unsigned seq;
+ 		bool negative;
+ 		dentry = __d_lookup_rcu(parent, &nd->last, &seq);
+ 		if (unlikely(!dentry)) {
+ 			if (unlazy_walk(nd))
+ 				return -ECHILD;
+ 			return 0;	/* dcache miss: walk_component() falls back to lookup_slow() */
+ 		}
+ 
+ 		/*
+ 		 * This sequence count validates that the inode matches
+ 		 * the dentry name information from lookup.
+ 		 */
+ 		*inode = d_backing_inode(dentry);
+ 		negative = d_is_negative(dentry);
+ 		if (unlikely(read_seqcount_retry(&dentry->d_seq, seq)))
+ 			return -ECHILD;
+ 
+ 		/*
+ 		 * This sequence count validates that the parent had no
+ 		 * changes while we did the lookup of the dentry above.
+ 		 *
+ 		 * The memory barrier in read_seqcount_begin of child is
+ 		 *  enough, we can use __read_seqcount_retry here.
+ 		 */
+ 		if (unlikely(__read_seqcount_retry(&parent->d_seq, nd->seq)))
+ 			return -ECHILD;
+ 
+ 		*seqp = seq;
+ 		status = d_revalidate(dentry, nd->flags);
+ 		if (likely(status > 0)) {
+ 			/*
+ 			 * Note: do negative dentry check after revalidation in
+ 			 * case that drops it.
+ 			 */
+ 			if (unlikely(negative))
+ 				return -ENOENT;
+ 			path->mnt = mnt;
+ 			path->dentry = dentry;
+ 			if (likely(__follow_mount_rcu(nd, path, inode, seqp)))
+ 				return 1;	/* success, still in RCU mode */
+ 		}
+ 		if (unlazy_child(nd, dentry, seq))	/* leave RCU mode before continuing below */
+ 			return -ECHILD;
+ 		if (unlikely(status == -ECHILD))
+ 			/* we'd been told to redo it in non-rcu mode */
+ 			status = d_revalidate(dentry, nd->flags);
+ 	} else {
+ 		dentry = __d_lookup(parent, &nd->last);
+ 		if (unlikely(!dentry))
+ 			return 0;	/* dcache miss: caller does the slow lookup */
+ 		status = d_revalidate(dentry, nd->flags);
+ 	}
+ 	if (unlikely(status <= 0)) {
+ 		if (!status)
+ 			d_invalidate(dentry);
+ 		dput(dentry);
+ 		return status;	/* 0 after invalidation, or the negative error */
+ 	}
+ 	if (unlikely(d_is_negative(dentry))) {
+ 		dput(dentry);
+ 		return -ENOENT;
+ 	}
+ 
+ 	path->mnt = mnt;
+ 	path->dentry = dentry;
+ 	err = follow_managed(path, nd);
+ 	if (likely(err > 0))	/* *inode is only filled in when follow_managed() returns > 0 */
+ 		*inode = d_backing_inode(path->dentry);
+ 	return err;
+ }
+ 
+ /* Fast lookup failed, do it the slow way */
+ static struct dentry *__lookup_slow(const struct qstr *name,
+ 				    struct dentry *dir,
+ 				    unsigned int flags)
+ {
+ 	struct dentry *dentry, *old;
+ 	struct inode *inode = dir->d_inode;
+ 	DECLARE_WAIT_QUEUE_HEAD_ONSTACK(wq);
+ 
+ 	/* Don't go there if it's already dead */
+ 	if (unlikely(IS_DEADDIR(inode)))
+ 		return ERR_PTR(-ENOENT);
+ again:
+ 	dentry = d_alloc_parallel(dir, name, &wq);
+ 	if (IS_ERR(dentry))
+ 		return dentry;
+ 	if (unlikely(!d_in_lookup(dentry))) {	/* not in-lookup: revalidate, don't call ->lookup() */
+ 		if (!(flags & LOOKUP_NO_REVAL)) {
+ 			int error = d_revalidate(dentry, flags);
+ 			if (unlikely(error <= 0)) {
+ 				if (!error) {
+ 					d_invalidate(dentry);
+ 					dput(dentry);
+ 					goto again;	/* invalidated: retry with a fresh d_alloc_parallel() */
+ 				}
+ 				dput(dentry);
+ 				dentry = ERR_PTR(error);
+ 			}
+ 		}
+ 	} else {
+ 		old = inode->i_op->lookup(inode, dentry, flags);
+ 		d_lookup_done(dentry);
+ 		if (unlikely(old)) {	/* non-NULL result from ->lookup() replaces our dentry */
+ 			dput(dentry);
+ 			dentry = old;
+ 		}
+ 	}
+ 	return dentry;
+ }
+ 
+ static struct dentry *lookup_slow(const struct qstr *name,
+ 				  struct dentry *dir,
+ 				  unsigned int flags)
+ {
+ 	struct inode *dir_inode = dir->d_inode;
+ 	struct dentry *found;
+ 	inode_lock_shared(dir_inode);	/* shared lock held only around __lookup_slow() */
+ 	found = __lookup_slow(name, dir, flags);
+ 	inode_unlock_shared(dir_inode);
+ 	return found;
+ }
+ 
+ static inline int may_lookup(struct nameidata *nd)
+ {
+ 	if (nd->flags & LOOKUP_RCU) {
+ 		int err = inode_permission(nd->inode, MAY_EXEC|MAY_NOT_BLOCK);
+ 		if (err != -ECHILD)	/* anything but -ECHILD is the final answer */
+ 			return err;
+ 		if (unlazy_walk(nd))	/* leave RCU mode so the check below can run */
+ 			return -ECHILD;
+ 	}
+ 	return inode_permission(nd->inode, MAY_EXEC);
+ }
+ 
+ static inline int handle_dots(struct nameidata *nd, int type)
+ {
+ 	/* anything other than ".." needs no work here */
+ 	if (type != LAST_DOTDOT)
+ 		return 0;
+ 
+ 	if (!nd->root.mnt)
+ 		set_root(nd);
+ 	if (nd->flags & LOOKUP_RCU)
+ 		return follow_dotdot_rcu(nd);
+ 	return follow_dotdot(nd);
+ }
+ 
+ static int pick_link(struct nameidata *nd, struct path *link,
+ 		     struct inode *inode, unsigned seq)
+ {
+ 	int error;
+ 	struct saved *last;
+ 	if (unlikely(nd->total_link_count++ >= MAXSYMLINKS)) {
+ 		path_to_nameidata(link, nd);
+ 		return -ELOOP;	/* too many symlinks followed during this walk */
+ 	}
+ 	if (!(nd->flags & LOOKUP_RCU)) {
+ 		if (link->mnt == nd->path.mnt)	/* ref-walk, same mount as nd: take our own mount ref */
+ 			mntget(link->mnt);
+ 	}
+ 	error = nd_alloc_stack(nd);
+ 	if (unlikely(error)) {
+ 		if (error == -ECHILD) {
+ 			if (unlikely(!legitimize_path(nd, link, seq))) {
+ 				drop_links(nd);	/* could not legitimize link: reset nd and leave RCU mode */
+ 				nd->depth = 0;
+ 				nd->flags &= ~LOOKUP_RCU;
+ 				nd->path.mnt = NULL;
+ 				nd->path.dentry = NULL;
+ 				rcu_read_unlock();
+ 			} else if (likely(unlazy_walk(nd) == 0))	/* hint covers the comparison: success expected */
+ 				error = nd_alloc_stack(nd);
+ 		}
+ 		if (error) {
+ 			path_put(link);
+ 			return error;
+ 		}
+ 	}
+ 
+ 	last = nd->stack + nd->depth++;	/* push a new entry on the link stack */
+ 	last->link = *link;
+ 	clear_delayed_call(&last->done);
+ 	nd->link_inode = inode;
+ 	last->seq = seq;
+ 	return 1;	/* link queued; link_path_walk() calls get_link() when it sees > 0 */
+ }
+ 
+ enum {WALK_FOLLOW = 1, WALK_MORE = 2};	/* flag bits passed to walk_component() and step_into() */
+ 
+ /*
+  * Do we need to follow links? We _really_ want to be able
+  * to do this check without having to look at inode->i_op,
+  * so we keep a cache of "no, this doesn't need follow_link"
+  * for the common case.
+  */
+ static inline int step_into(struct nameidata *nd, struct path *path,
+ 			    int flags, struct inode *inode, unsigned seq)
+ {
+ 	if (!(flags & WALK_MORE) && nd->depth)	/* without WALK_MORE, drop the current link-stack entry */
+ 		put_link(nd);
+ 	if (likely(!d_is_symlink(path->dentry)) ||
+ 	   !(flags & WALK_FOLLOW || nd->flags & LOOKUP_FOLLOW)) {
+ 		/* not a symlink or should not follow */
+ 		path_to_nameidata(path, nd);
+ 		nd->inode = inode;
+ 		nd->seq = seq;
+ 		return 0;	/* nd now describes the new location */
+ 	}
+ 	/* make sure that d_is_symlink above matches inode */
+ 	if (nd->flags & LOOKUP_RCU) {
+ 		if (read_seqcount_retry(&path->dentry->d_seq, seq))
+ 			return -ECHILD;
+ 	}
+ 	return pick_link(nd, path, inode, seq);	/* 1 if the link was queued, otherwise an error */
+ }
+ 
+ static int walk_component(struct nameidata *nd, int flags)
+ {
+ 	struct path path;
+ 	struct inode *inode;
+ 	unsigned seq;
+ 	int err;
+ 	/*
+ 	 * "." and ".." are special - ".." especially so because it has
+ 	 * to be able to know about the current root directory and
+ 	 * parent relationships.
+ 	 */
+ 	if (unlikely(nd->last_type != LAST_NORM)) {
+ 		err = handle_dots(nd, nd->last_type);
+ 		if (!(flags & WALK_MORE) && nd->depth)	/* same link-stack cleanup as step_into() does */
+ 			put_link(nd);
+ 		return err;
+ 	}
+ 	err = lookup_fast(nd, &path, &inode, &seq);
+ 	if (unlikely(err <= 0)) {
+ 		if (err < 0)
+ 			return err;
+ 		path.dentry = lookup_slow(&nd->last, nd->path.dentry,	/* err == 0: fall back to the slow lookup */
+ 					  nd->flags);
+ 		if (IS_ERR(path.dentry))
+ 			return PTR_ERR(path.dentry);
+ 
+ 		path.mnt = nd->path.mnt;
+ 		err = follow_managed(&path, nd);
+ 		if (unlikely(err < 0))
+ 			return err;
+ 
+ 		if (unlikely(d_is_negative(path.dentry))) {
+ 			path_to_nameidata(&path, nd);
+ 			return -ENOENT;
+ 		}
+ 
+ 		seq = 0;	/* we are already out of RCU mode */
+ 		inode = d_backing_inode(path.dentry);
+ 	}
+ 
+ 	return step_into(nd, &path, flags, inode, seq);	/* 0: moved, >0: symlink to follow, <0: error */
+ }
+ 
+ /*
+  * We can do the critical dentry name comparison and hashing
+  * operations one word at a time, but we are limited to:
+  *
+  * - Architectures with fast unaligned word accesses. We could
+  *   do a "get_unaligned()" if this helps and is sufficiently
+  *   fast.
+  *
+  * - non-CONFIG_DEBUG_PAGEALLOC configurations (so that we
+  *   do not trap on the (extremely unlikely) case of a page
+  *   crossing operation).
+  *
+  * - Furthermore, we need an efficient 64-bit compile for the
+  *   64-bit case in order to generate the "number of bytes in
+  *   the final mask". Again, that could be replaced with an
+  *   efficient population count instruction or similar.
+  */
+ #ifdef CONFIG_DCACHE_WORD_ACCESS
+ 
+ #include <asm/word-at-a-time.h>
+ 
+ #ifdef HASH_MIX
+ 
+ /* Architecture provides HASH_MIX and fold_hash() in <asm/hash.h> */
+ 
+ #elif defined(CONFIG_64BIT)
+ /*
+  * Register pressure in the mixing function is an issue, particularly
+  * on 32-bit x86, but almost any function requires one state value and
+  * one temporary.  Instead, use a function designed for two state values
+  * and no temporaries.
+  *
+  * This function cannot create a collision in only two iterations, so
+  * we have two iterations to achieve avalanche.  In those two iterations,
+  * we have six layers of mixing, which is enough to spread one bit's
+  * influence out to 2^6 = 64 state bits.
+  *
+  * Rotate constants are scored by considering either 64 one-bit input
+  * deltas or 64*63/2 = 2016 two-bit input deltas, and finding the
+  * probability of that delta causing a change to each of the 128 output
+  * bits, using a sample of random initial states.
+  *
+  * The Shannon entropy of the computed probabilities is then summed
+  * to produce a score.  Ideally, any input change has a 50% chance of
+  * toggling any given output bit.
+  *
+  * Mixing scores (in bits) for (12,45):
+  * Input delta: 1-bit      2-bit
+  * 1 round:     713.3    42542.6
+  * 2 rounds:   2753.7   140389.8
+  * 3 rounds:   5954.1   233458.2
+  * 4 rounds:   7862.6   256672.2
+  * Perfect:    8192     258048
+  *            (64*128) (64*63/2 * 128)
+  */
+ #define HASH_MIX(x, y, a)	/* folds a into the state; x and y are both modified in place */ \
+ 	(	x ^= (a),	\
+ 	y ^= x,	x = rol64(x,12),\
+ 	x += y,	y = rol64(y,45),\
+ 	y *= 9			)
+ 
+ /*
+  * Fold two longs into one 32-bit hash value.  This must be fast, but
+  * latency isn't quite as critical, as there is a fair bit of additional
+  * work done before the hash value is used.
+  */
+ static inline unsigned int fold_hash(unsigned long x, unsigned long y)
+ {
+ 	unsigned long mixed = (y ^ (x * GOLDEN_RATIO_64)) * GOLDEN_RATIO_64;
+ 
+ 	/* the result is whatever remains after shifting out the low 32 bits */
+ 	return mixed >> 32;
+ }
+ 
+ #else	/* 32-bit case */
+ 
+ /*
+  * Mixing scores (in bits) for (7,20):
+  * Input delta: 1-bit      2-bit
+  * 1 round:     330.3     9201.6
+  * 2 rounds:   1246.4    25475.4
+  * 3 rounds:   1907.1    31295.1
+  * 4 rounds:   2042.3    31718.6
+  * Perfect:    2048      31744
+  *            (32*64)   (32*31/2 * 64)
+  */
+ #define HASH_MIX(x, y, a)	/* folds a into the state; x and y are both modified in place */ \
+ 	(	x ^= (a),	\
+ 	y ^= x,	x = rol32(x, 7),\
+ 	x += y,	y = rol32(y,20),\
+ 	y *= 9			)
+ 
+ static inline unsigned int fold_hash(unsigned long x, unsigned long y)
+ {
+ 	unsigned int hx = __hash_32(x);	/* Use arch-optimized multiply if one exists */
+ 	return __hash_32(y ^ hx);
+ }
+ 
+ #endif
+ 
+ /*
+  * Return the hash of a string of known length.  This is carefully
+  * designed to match hash_name(), which is the more critical function.
+  * In particular, we must end by hashing a final word containing 0..7
+  * payload bytes, to match the way that hash_name() iterates until it
+  * finds the delimiter after the name.
+  */
+ unsigned int full_name_hash(const void *salt, const char *name, unsigned int len)
+ {
+ 	unsigned long a, x = 0, y = (unsigned long)salt;	/* the salt pointer value seeds y */
+ 
+ 	for (;;) {
+ 		if (!len)
+ 			goto done;	/* nothing left: skip the partial-word fold */
+ 		a = load_unaligned_zeropad(name);
+ 		if (len < sizeof(unsigned long))
+ 			break;	/* final partial word is folded in after the loop */
+ 		HASH_MIX(x, y, a);
+ 		name += sizeof(unsigned long);
+ 		len -= sizeof(unsigned long);
+ 	}
+ 	x ^= a & bytemask_from_count(len);	/* keep only the remaining len bytes of the last word */
+ done:
+ 	return fold_hash(x, y);
+ }
+ EXPORT_SYMBOL(full_name_hash);
+ 
+ /* Return the "hash_len" (hash and length) of a null-terminated string */
+ u64 hashlen_string(const void *salt, const char *name)
+ {
+ 	unsigned long a = 0, x = 0, y = (unsigned long)salt;
+ 	unsigned long adata, mask, len;
+ 	const struct word_at_a_time constants = WORD_AT_A_TIME_CONSTANTS;
+ 
+ 	len = 0;
+ 	goto inside;	/* first pass loads a word before any mixing happens */
+ 
+ 	do {
+ 		HASH_MIX(x, y, a);
+ 		len += sizeof(unsigned long);
+ inside:
+ 		a = load_unaligned_zeropad(name+len);
+ 	} while (!has_zero(a, &adata, &constants));	/* stop at the word containing a zero byte */
+ 
+ 	adata = prep_zero_mask(a, adata, &constants);
+ 	mask = create_zero_mask(adata);
+ 	x ^= a & zero_bytemask(mask);	/* fold in the last word, masked with zero_bytemask() */
+ 
+ 	return hashlen_create(fold_hash(x, y), len + find_zero(mask));	/* length = whole words + find_zero() */
+ }
+ EXPORT_SYMBOL(hashlen_string);
+ 
+ /*
+  * Calculate the length and hash of the path component, and
+  * return the "hash_len" as the result.
+  */
+ static inline u64 hash_name(const void *salt, const char *name)
+ {
+ 	unsigned long a = 0, b, x = 0, y = (unsigned long)salt;
+ 	unsigned long adata, bdata, mask, len;
+ 	const struct word_at_a_time constants = WORD_AT_A_TIME_CONSTANTS;
+ 
+ 	len = 0;
+ 	goto inside;	/* first pass loads a word before any mixing happens */
+ 
+ 	do {
+ 		HASH_MIX(x, y, a);
+ 		len += sizeof(unsigned long);
+ inside:
+ 		a = load_unaligned_zeropad(name+len);
+ 		b = a ^ REPEAT_BYTE('/');	/* a '/' byte in a becomes a zero byte in b */
+ 	} while (!(has_zero(a, &adata, &constants) | has_zero(b, &bdata, &constants)));
+ 
+ 	adata = prep_zero_mask(a, adata, &constants);
+ 	bdata = prep_zero_mask(b, bdata, &constants);
+ 	mask = create_zero_mask(adata | bdata);	/* combined mask: NUL or '/' delimiter */
+ 	x ^= a & zero_bytemask(mask);
+ 
+ 	return hashlen_create(fold_hash(x, y), len + find_zero(mask));
+ }
+ 
+ #else	/* !CONFIG_DCACHE_WORD_ACCESS: Slow, byte-at-a-time version */
+ 
+ /* Return the hash of a string of known length */
+ unsigned int full_name_hash(const void *salt, const char *name, unsigned int len)
+ {
+ 	unsigned long hash = init_name_hash(salt);
+ 	for (; len; len--, name++)	/* exactly len bytes, one per iteration */
+ 		hash = partial_name_hash((unsigned char)*name, hash);
+ 	return end_name_hash(hash);
+ }
+ EXPORT_SYMBOL(full_name_hash);
+ 
+ /* Return the "hash_len" (hash and length) of a null-terminated string */
+ u64 hashlen_string(const void *salt, const char *name)
+ {
+ 	unsigned long hash = init_name_hash(salt);
+ 	unsigned long n, ch;
+ 
+ 	/*
+ 	 * Hash every byte ahead of the terminating NUL; n ends up as
+ 	 * the string length.
+ 	 */
+ 	for (n = 0; (ch = (unsigned char)name[n]) != 0; n++)
+ 		hash = partial_name_hash(ch, hash);
+ 	return hashlen_create(end_name_hash(hash), n);
+ }
+ EXPORT_SYMBOL(hashlen_string);
+ 
+ /*
+  * We know there's a real path component here of at least
+  * one character.
+  */
+ static inline u64 hash_name(const void *salt, const char *name)
+ {
+ 	unsigned long hash = init_name_hash(salt);
+ 	unsigned long pos = 0, ch = (unsigned char)name[0];
+ 
+ 	for (;;) {	/* the first byte is always hashed, whatever it is */
+ 		hash = partial_name_hash(ch, hash);
+ 		ch = (unsigned char)name[++pos];
+ 		if (!ch || ch == '/')
+ 			break;
+ 	}
+ 	return hashlen_create(end_name_hash(hash), pos);
+ }
+ 
+ #endif
+ 
+ /*
+  * Name resolution.
+  * This is the basic name resolution function, turning a pathname into
+  * the final dentry. We expect 'base' to be positive and a directory.
+  *
+  * Returns 0 and nd will have valid dentry and mnt on success.
+  * Returns error and drops reference to input namei data on failure.
+  */
+ static int link_path_walk(const char *name, struct nameidata *nd)
+ {
+ 	int err;
+ 
+ 	if (IS_ERR(name))	/* callers may hand in an ERR_PTR; pass its error straight back */
+ 		return PTR_ERR(name);
+ 	while (*name=='/')
+ 		name++;
+ 	if (!*name)
+ 		return 0;
+ 
+ 	/* At this point we know we have a real path component. */
+ 	for(;;) {
+ 		u64 hash_len;
+ 		int type;
+ 
+ 		err = may_lookup(nd);
+ 		if (err)
+ 			return err;
+ 
+ 		hash_len = hash_name(nd->path.dentry, name);	/* the parent dentry is the hash salt */
+ 
+ 		type = LAST_NORM;
+ 		if (name[0] == '.') switch (hashlen_len(hash_len)) {
+ 			case 2:
+ 				if (name[1] == '.') {
+ 					type = LAST_DOTDOT;
+ 					nd->flags |= LOOKUP_JUMPED;
+ 				}
+ 				break;
+ 			case 1:
+ 				type = LAST_DOT;
+ 		}
+ 		if (likely(type == LAST_NORM)) {
+ 			struct dentry *parent = nd->path.dentry;
+ 			nd->flags &= ~LOOKUP_JUMPED;
+ 			if (unlikely(parent->d_flags & DCACHE_OP_HASH)) {
+ 				struct qstr this = { { .hash_len = hash_len }, .name = name };
+ 				err = parent->d_op->d_hash(parent, &this);	/* filesystem-specific hash */
+ 				if (err < 0)
+ 					return err;
+ 				hash_len = this.hash_len;
+ 				name = this.name;
+ 			}
+ 		}
+ 
+ 		nd->last.hash_len = hash_len;
+ 		nd->last.name = name;
+ 		nd->last_type = type;
+ 
+ 		name += hashlen_len(hash_len);
+ 		if (!*name)
+ 			goto OK;
+ 		/*
+ 		 * If it wasn't NUL, we know it was '/'. Skip that
+ 		 * slash, and continue until no more slashes.
+ 		 */
+ 		do {
+ 			name++;
+ 		} while (unlikely(*name == '/'));
+ 		if (unlikely(!*name)) {
+ OK:
+ 			/* pathname body, done */
+ 			if (!nd->depth)
+ 				return 0;
+ 			name = nd->stack[nd->depth - 1].name;	/* resume the name saved when the link was entered */
+ 			/* trailing symlink, done */
+ 			if (!name)
+ 				return 0;
+ 			/* last component of nested symlink */
+ 			err = walk_component(nd, WALK_FOLLOW);
+ 		} else {
+ 			/* not the last component */
+ 			err = walk_component(nd, WALK_FOLLOW | WALK_MORE);
+ 		}
+ 		if (err < 0)
+ 			return err;
+ 
+ 		if (err) {	/* > 0: walk_component() queued a symlink */
+ 			const char *s = get_link(nd);
+ 
+ 			if (IS_ERR(s))
+ 				return PTR_ERR(s);
+ 			err = 0;
+ 			if (unlikely(!s)) {
+ 				/* jumped */
+ 				put_link(nd);
+ 			} else {
+ 				nd->stack[nd->depth - 1].name = name;	/* save the rest of this name for later */
+ 				name = s;
+ 				continue;	/* walk the link body next */
+ 			}
+ 		}
+ 		if (unlikely(!d_can_lookup(nd->path.dentry))) {
+ 			if (nd->flags & LOOKUP_RCU) {
+ 				if (unlazy_walk(nd))
+ 					return -ECHILD;
+ 			}
+ 			return -ENOTDIR;
+ 		}
+ 	}
+ }
+ 
+ /* must be paired with terminate_walk() */
+ static const char *path_init(struct nameidata *nd, unsigned flags)
+ {
+ 	const char *s = nd->name->name;
+ 
+ 	if (!*s)	/* empty pathname: do not use RCU mode */
+ 		flags &= ~LOOKUP_RCU;
+ 	if (flags & LOOKUP_RCU)
+ 		rcu_read_lock();
+ 
+ 	nd->last_type = LAST_ROOT; /* if there are only slashes... */
+ 	nd->flags = flags | LOOKUP_JUMPED | LOOKUP_PARENT;
+ 	nd->depth = 0;
+ 	if (flags & LOOKUP_ROOT) {	/* caller pre-set nd->root: start the walk there */
+ 		struct dentry *root = nd->root.dentry;
+ 		struct inode *inode = root->d_inode;
+ 		if (*s && unlikely(!d_can_lookup(root)))
+ 			return ERR_PTR(-ENOTDIR);
+ 		nd->path = nd->root;
+ 		nd->inode = inode;
+ 		if (flags & LOOKUP_RCU) {
+ 			nd->seq = __read_seqcount_begin(&nd->path.dentry->d_seq);
+ 			nd->root_seq = nd->seq;
+ 			nd->m_seq = read_seqbegin(&mount_lock);
+ 		} else {
+ 			path_get(&nd->path);
+ 		}
+ 		return s;
+ 	}
+ 
+ 	nd->root.mnt = NULL;
+ 	nd->path.mnt = NULL;
+ 	nd->path.dentry = NULL;
+ 
+ 	nd->m_seq = read_seqbegin(&mount_lock);
+ 	if (*s == '/') {	/* absolute pathname */
+ 		set_root(nd);
+ 		if (likely(!nd_jump_root(nd)))
+ 			return s;
+ 		return ERR_PTR(-ECHILD);
+ 	} else if (nd->dfd == AT_FDCWD) {	/* relative to current->fs->pwd */
+ 		if (flags & LOOKUP_RCU) {
+ 			struct fs_struct *fs = current->fs;
+ 			unsigned seq;
+ 
+ 			do {	/* retry until fs->seq shows a consistent snapshot of pwd */
+ 				seq = read_seqcount_begin(&fs->seq);
+ 				nd->path = fs->pwd;
+ 				nd->inode = nd->path.dentry->d_inode;
+ 				nd->seq = __read_seqcount_begin(&nd->path.dentry->d_seq);
+ 			} while (read_seqcount_retry(&fs->seq, seq));
+ 		} else {
+ 			get_fs_pwd(current->fs, &nd->path);
+ 			nd->inode = nd->path.dentry->d_inode;
+ 		}
+ 		return s;
+ 	} else {
+ 		/* Caller must check execute permissions on the starting path component */
+ 		struct fd f = fdget_raw(nd->dfd);
+ 		struct dentry *dentry;
+ 
+ 		if (!f.file)
+ 			return ERR_PTR(-EBADF);
+ 
+ 		dentry = f.file->f_path.dentry;
+ 
+ 		if (*s && unlikely(!d_can_lookup(dentry))) {	/* non-empty name needs a directory to start from */
+ 			fdput(f);
+ 			return ERR_PTR(-ENOTDIR);
+ 		}
+ 
+ 		nd->path = f.file->f_path;
+ 		if (flags & LOOKUP_RCU) {
+ 			nd->inode = nd->path.dentry->d_inode;
+ 			nd->seq = read_seqcount_begin(&nd->path.dentry->d_seq);
+ 		} else {
+ 			path_get(&nd->path);
+ 			nd->inode = nd->path.dentry->d_inode;
+ 		}
+ 		fdput(f);
+ 		return s;
+ 	}
+ }
+ 
+ static const char *trailing_symlink(struct nameidata *nd)
+ {
+ 	const char *s;
+ 	int error = may_follow_link(nd);
+ 	if (unlikely(error))
+ 		return ERR_PTR(error);	/* link_path_walk() turns this back into the error */
+ 	nd->flags |= LOOKUP_PARENT;
+ 	nd->stack[0].name = NULL;	/* NULL saved name marks "trailing symlink, done" for link_path_walk() */
+ 	s = get_link(nd);
+ 	return s ? s : "";	/* NULL from get_link() becomes an empty body */
+ }
+ 
+ static inline int lookup_last(struct nameidata *nd)
+ {
+ 	if (nd->last_type == LAST_NORM && nd->last.name[nd->last.len])	/* characters follow the last component */
+ 		nd->flags |= LOOKUP_FOLLOW | LOOKUP_DIRECTORY;
+ 
+ 	nd->flags &= ~LOOKUP_PARENT;
+ 	return walk_component(nd, 0);	/* no WALK_FOLLOW: following depends on nd->flags */
+ }
+ 
+ static int handle_lookup_down(struct nameidata *nd)
+ {
+ 	struct path path = nd->path;	/* work on a local copy of the starting point */
+ 	struct inode *inode = nd->inode;
+ 	unsigned seq = nd->seq;
+ 	int err;
+ 
+ 	if (nd->flags & LOOKUP_RCU) {
+ 		/*
+ 		 * don't bother with unlazy_walk on failure - we are
+ 		 * at the very beginning of walk, so we lose nothing
+ 		 * if we simply redo everything in non-RCU mode
+ 		 */
+ 		if (unlikely(!__follow_mount_rcu(nd, &path, &inode, &seq)))
+ 			return -ECHILD;
+ 	} else {
+ 		dget(path.dentry);	/* take a dentry reference for the local copy */
+ 		err = follow_managed(&path, nd);
+ 		if (unlikely(err < 0))
+ 			return err;
+ 		inode = d_backing_inode(path.dentry);
+ 		seq = 0;
+ 	}
+ 	path_to_nameidata(&path, nd);	/* install the result back into nd */
+ 	nd->inode = inode;
+ 	nd->seq = seq;
+ 	return 0;
+ }
+ 
+ /* Returns 0 and nd will be valid on success; Returns error, otherwise. */
+ static int path_lookupat(struct nameidata *nd, unsigned flags, struct path *path)
+ {
+ 	const char *s = path_init(nd, flags);
+ 	int err;
+ 
+ 	if (unlikely(flags & LOOKUP_DOWN) && !IS_ERR(s)) {
+ 		err = handle_lookup_down(nd);
+ 		if (unlikely(err < 0))
+ 			s = ERR_PTR(err);	/* link_path_walk() below reports it */
+ 	}
+ 
+ 	while (!(err = link_path_walk(s, nd))
+ 		&& ((err = lookup_last(nd)) > 0)) {	/* > 0: last component is a symlink to follow */
+ 		s = trailing_symlink(nd);
+ 	}
+ 	if (!err)
+ 		err = complete_walk(nd);
+ 
+ 	if (!err && nd->flags & LOOKUP_DIRECTORY)
+ 		if (!d_can_lookup(nd->path.dentry))
+ 			err = -ENOTDIR;
+ 	if (!err) {
+ 		*path = nd->path;	/* hand the result to the caller ... */
+ 		nd->path.mnt = NULL;	/* ... and clear nd's copy before terminate_walk() */
+ 		nd->path.dentry = NULL;
+ 	}
+ 	terminate_walk(nd);
+ 	return err;
+ }
+ 
+ int filename_lookup(int dfd, struct filename *name, unsigned flags,
+ 		    struct path *path, struct path *root)
+ {
+ 	int retval;
+ 	struct nameidata nd;
+ 	if (IS_ERR(name))	/* lets callers pass a getname*() result unchecked */
+ 		return PTR_ERR(name);
+ 	if (unlikely(root)) {
+ 		nd.root = *root;
+ 		flags |= LOOKUP_ROOT;
+ 	}
+ 	set_nameidata(&nd, dfd, name);
+ 	retval = path_lookupat(&nd, flags | LOOKUP_RCU, path);	/* first attempt: RCU mode */
+ 	if (unlikely(retval == -ECHILD))
+ 		retval = path_lookupat(&nd, flags, path);	/* redo without LOOKUP_RCU */
+ 	if (unlikely(retval == -ESTALE))
+ 		retval = path_lookupat(&nd, flags | LOOKUP_REVAL, path);	/* redo with LOOKUP_REVAL */
+ 
+ 	if (likely(!retval))
+ 		audit_inode(name, path->dentry, 0);
+ 	restore_nameidata();
+ 	putname(name);	/* name is released here on both success and failure */
+ 	return retval;
+ }
+ 
+ /* Returns 0 and nd will be valid on success; Returns error, otherwise. */
+ static int path_parentat(struct nameidata *nd, unsigned flags,
+ 				struct path *parent)
+ {
+ 	const char *s = path_init(nd, flags);
+ 	int err = link_path_walk(s, nd);	/* stops with the last component left in nd->last */
+ 	if (!err)
+ 		err = complete_walk(nd);
+ 	if (!err) {
+ 		*parent = nd->path;	/* hand the result to the caller ... */
+ 		nd->path.mnt = NULL;	/* ... and clear nd's copy before terminate_walk() */
+ 		nd->path.dentry = NULL;
+ 	}
+ 	terminate_walk(nd);
+ 	return err;
+ }
+ 
+ static struct filename *filename_parentat(int dfd, struct filename *name,
+ 				unsigned int flags, struct path *parent,
+ 				struct qstr *last, int *type)
+ {
+ 	int retval;
+ 	struct nameidata nd;
+ 
+ 	if (IS_ERR(name))
+ 		return name;
+ 	set_nameidata(&nd, dfd, name);
+ 	retval = path_parentat(&nd, flags | LOOKUP_RCU, parent);	/* first attempt: RCU mode */
+ 	if (unlikely(retval == -ECHILD))
+ 		retval = path_parentat(&nd, flags, parent);	/* redo without LOOKUP_RCU */
+ 	if (unlikely(retval == -ESTALE))
+ 		retval = path_parentat(&nd, flags | LOOKUP_REVAL, parent);	/* redo with LOOKUP_REVAL */
+ 	if (likely(!retval)) {
+ 		*last = nd.last;
+ 		*type = nd.last_type;
+ 		audit_inode(name, parent->dentry, AUDIT_INODE_PARENT);
+ 	} else {
+ 		putname(name);	/* failure: name is released here and an ERR_PTR returned */
+ 		name = ERR_PTR(retval);
+ 	}
+ 	restore_nameidata();
+ 	return name;	/* on success the caller still has to putname() it */
+ }
+ 
+ /* does lookup, returns the object with parent locked; ERR_PTR (nothing locked) on failure */
+ struct dentry *kern_path_locked(const char *name, struct path *path)
+ {
+ 	struct filename *filename;
+ 	struct dentry *d;
+ 	struct qstr last;
+ 	int type;
+ 
+ 	filename = filename_parentat(AT_FDCWD, getname_kernel(name), 0, path,
+ 				    &last, &type);
+ 	if (IS_ERR(filename))
+ 		return ERR_CAST(filename);
+ 	if (unlikely(type != LAST_NORM)) {	/* only a normal last component is accepted */
+ 		path_put(path);
+ 		putname(filename);
+ 		return ERR_PTR(-EINVAL);
+ 	}
+ 	inode_lock_nested(path->dentry->d_inode, I_MUTEX_PARENT);
+ 	d = __lookup_hash(&last, path->dentry, 0);
+ 	if (IS_ERR(d)) {
+ 		inode_unlock(path->dentry->d_inode);	/* failure: undo the lock and the path reference */
+ 		path_put(path);
+ 	}
+ 	putname(filename);
+ 	return d;
+ }
+ 
+ int kern_path(const char *name, unsigned int flags, struct path *path)
+ {
+ 	struct filename *fname = getname_kernel(name);	/* an ERR_PTR is checked by filename_lookup() */
+ 	return filename_lookup(AT_FDCWD, fname, flags, path, NULL);
+ }
+ EXPORT_SYMBOL(kern_path);
+ 
+ /**
+  * vfs_path_lookup - lookup a file path relative to a dentry-vfsmount pair
+  * @dentry:  pointer to dentry of the base directory
+  * @mnt: pointer to vfs mount of the base directory
+  * @name: pointer to file name
+  * @flags: lookup flags
+  * @path: pointer to struct path to fill
+  */
+ int vfs_path_lookup(struct dentry *dentry, struct vfsmount *mnt,
+ 		    const char *name, unsigned int flags,
+ 		    struct path *path)
+ {
+ 	struct filename *fname = getname_kernel(name);
+ 	struct path root = { .dentry = dentry, .mnt = mnt };
+ 	/* with an explicit root, filename_lookup() ignores its first argument */
+ 	return filename_lookup(AT_FDCWD, fname, flags, path, &root);
+ }
+ EXPORT_SYMBOL(vfs_path_lookup);
+ 
+ static int lookup_one_len_common(const char *name, struct dentry *base,
+ 				 int len, struct qstr *this)
+ {
+ 	this->name = name;
+ 	this->len = len;
+ 	this->hash = full_name_hash(base, name, len);	/* base dentry is the hash salt */
+ 	if (!len)	/* empty name is refused */
+ 		return -EACCES;
+ 
+ 	if (unlikely(name[0] == '.')) {
+ 		if (len < 2 || (len == 2 && name[1] == '.'))	/* "." and ".." are refused */
+ 			return -EACCES;
+ 	}
+ 
+ 	while (len--) {
+ 		unsigned int c = *(const unsigned char *)name++;
+ 		if (c == '/' || c == '\0')	/* must be one component with no embedded NUL */
+ 			return -EACCES;
+ 	}
+ 	/*
+ 	 * See if the low-level filesystem might want
+ 	 * to use its own hash..
+ 	 */
+ 	if (base->d_flags & DCACHE_OP_HASH) {
+ 		int err = base->d_op->d_hash(base, this);
+ 		if (err < 0)
+ 			return err;
+ 	}
+ 
+ 	return inode_permission(base->d_inode, MAY_EXEC);	/* finally, MAY_EXEC permission check on base */
+ }
+ 
+ /**
+  * try_lookup_one_len - filesystem helper to lookup single pathname component
+  * @name:	pathname component to lookup
+  * @base:	base directory to lookup from
+  * @len:	maximum length @len should be interpreted to
+  *
+  * Look up a dentry by name in the dcache, returning NULL if it does not
+  * currently exist.  The function does not try to create a dentry.
+  *
+  * Note that this routine is purely a helper for filesystem usage and should
+  * not be called by generic code.
+  *
+  * The caller must hold base->i_mutex.
+  */
+ struct dentry *try_lookup_one_len(const char *name, struct dentry *base, int len)
+ {
+ 	struct qstr this;
+ 	int err;
+ 
+ 	WARN_ON_ONCE(!inode_is_locked(base->d_inode));	/* warn once if the inode lock is not held */
+ 
+ 	err = lookup_one_len_common(name, base, len, &this);	/* validates the name and fills in this */
+ 	if (err)
+ 		return ERR_PTR(err);
+ 
+ 	return lookup_dcache(&this, base, 0);	/* dcache only: NULL when not cached */
+ }
+ EXPORT_SYMBOL(try_lookup_one_len);
+ 
+ /**
+  * lookup_one_len - filesystem helper to lookup single pathname component
+  * @name:	pathname component to lookup
+  * @base:	base directory to lookup from
+  * @len:	maximum length @len should be interpreted to
+  *
+  * Note that this routine is purely a helper for filesystem usage and should
+  * not be called by generic code.
+  *
+  * The caller must hold base->i_mutex.
+  */
+ struct dentry *lookup_one_len(const char *name, struct dentry *base, int len)
+ {
+ 	struct dentry *dentry;
+ 	struct qstr this;
+ 	int err;
+ 
+ 	WARN_ON_ONCE(!inode_is_locked(base->d_inode));	/* warn once if the inode lock is not held */
+ 
+ 	err = lookup_one_len_common(name, base, len, &this);	/* validates the name and fills in this */
+ 	if (err)
+ 		return ERR_PTR(err);
+ 
+ 	dentry = lookup_dcache(&this, base, 0);
+ 	return dentry ? dentry : __lookup_slow(&this, base, 0);	/* NULL (not cached): do the real lookup */
+ }
+ EXPORT_SYMBOL(lookup_one_len);
+ 
+ /**
+  * lookup_one_len_unlocked - filesystem helper to lookup single pathname component
+  * @name:	pathname component to lookup
+  * @base:	base directory to lookup from
+  * @len:	maximum length @len should be interpreted to
+  *
+  * Note that this routine is purely a helper for filesystem usage and should
+  * not be called by generic code.
+  *
+  * Unlike lookup_one_len, it should be called without the parent
+  * i_mutex held, and will take the i_mutex itself if necessary.
+  */
+ struct dentry *lookup_one_len_unlocked(const char *name,
+ 				       struct dentry *base, int len)
+ {
+ 	struct qstr this;
+ 	int err;
+ 	struct dentry *ret;
+ 
+ 	err = lookup_one_len_common(name, base, len, &this);	/* validates the name and fills in this */
+ 	if (err)
+ 		return ERR_PTR(err);
+ 
+ 	ret = lookup_dcache(&this, base, 0);
+ 	if (!ret)
+ 		ret = lookup_slow(&this, base, 0);	/* lookup_slow() takes the inode lock shared itself */
+ 	return ret;
+ }
+ EXPORT_SYMBOL(lookup_one_len_unlocked);
+ 
+ #ifdef CONFIG_UNIX98_PTYS
+ int path_pts(struct path *path)
+ {
+ 	/* Find something mounted on "pts" in the same directory as
+ 	 * the input path.
+ 	 */
+ 	struct dentry *child, *parent;
+ 	struct qstr this;
+ 	int ret;
+ 
+ 	ret = path_parent_directory(path);	/* path now refers to the containing directory */
+ 	if (ret)
+ 		return ret;
+ 
+ 	parent = path->dentry;
+ 	this.name = "pts";
+ 	this.len = 3;
+ 	child = d_hash_and_lookup(parent, &this);
+ 	if (!child)
+ 		return -ENOENT;	/* path is left pointing at the parent directory */
+ 
+ 	path->dentry = child;
+ 	dput(parent);
+ 	follow_mount(path);	/* move to the top of anything mounted on "pts" */
+ 	return 0;
+ }
+ #endif
+ 
+ int user_path_at_empty(int dfd, const char __user *name, unsigned flags,
+ 		 struct path *path, int *empty)
+ {
+ 	struct filename *fname = getname_flags(name, flags, empty);	/* an ERR_PTR is checked by filename_lookup() */
+ 	return filename_lookup(dfd, fname, flags, path, NULL);
+ }
+ EXPORT_SYMBOL(user_path_at_empty);
+ 
+ /**
+  * mountpoint_last - look up last component for umount
+  * @nd:   pathwalk nameidata - currently pointing at parent directory of "last"
+  *
+  * This is a special lookup_last function just for umount. In this case, we
+  * need to resolve the path without doing any revalidation.
+  *
+  * The nameidata should be the result of doing a LOOKUP_PARENT pathwalk. Since
+  * mountpoints are always pinned in the dcache, their ancestors are too. Thus,
+  * in almost all cases, this lookup will be served out of the dcache. The only
+  * cases where it won't are if nd->last refers to a symlink or the path is
+  * bogus and it doesn't exist.
+  *
+  * Returns:
+  * -error: if there was an error during lookup. This includes -ENOENT if the
+  *         lookup found a negative dentry.
+  *
+  * 0:      if we successfully resolved nd->last and found it not to be a
+  *         symlink that needs to be followed.
+  *
+  * 1:      if we successfully resolved nd->last and found it to be a symlink
+  *         that needs to be followed.
+  */
+ static int
+ mountpoint_last(struct nameidata *nd)
+ {
+ 	int error = 0;
+ 	struct dentry *dir = nd->path.dentry;
+ 	struct path path;
+ 
+ 	/* If we're in rcuwalk, drop out of it to handle last component */
+ 	if (nd->flags & LOOKUP_RCU) {
+ 		if (unlazy_walk(nd))
+ 			return -ECHILD;
+ 	}
+ 
+ 	nd->flags &= ~LOOKUP_PARENT;
+ 
+ 	if (unlikely(nd->last_type != LAST_NORM)) {
+ 		error = handle_dots(nd, nd->last_type);
+ 		if (error)
+ 			return error;
+ 		path.dentry = dget(nd->path.dentry);
+ 	} else {
+ 		path.dentry = d_lookup(dir, &nd->last);
+ 		if (!path.dentry) {
+ 			/*
+ 			 * No cached dentry. Mounted dentries are pinned in the
+ 			 * cache, so that means that this dentry is probably
+ 			 * a symlink or the path doesn't actually point
+ 			 * to a mounted dentry.
+ 			 */
+ 			path.dentry = lookup_slow(&nd->last, dir,
+ 					     nd->flags | LOOKUP_NO_REVAL);
+ 			if (IS_ERR(path.dentry))
+ 				return PTR_ERR(path.dentry);
+ 		}
+ 	}
+ 	if (d_is_negative(path.dentry)) {
+ 		dput(path.dentry);
+ 		return -ENOENT;
+ 	}
+ 	path.mnt = nd->path.mnt;
+ 	return step_into(nd, &path, 0, d_backing_inode(path.dentry), 0);
+ }
+ 
+ /**
+  * path_mountpoint - look up a path to be umounted
+  * @nd:		lookup context
+  * @flags:	lookup flags
+  * @path:	pointer to container for result
+  *
+  * Look up the given name, but don't attempt to revalidate the last component.
+  * Returns 0 and "path" will be valid on success; Returns error otherwise.
+  */
+ static int
+ path_mountpoint(struct nameidata *nd, unsigned flags, struct path *path)
+ {
+ 	const char *s = path_init(nd, flags);
+ 	int err;
+ 
+ 	while (!(err = link_path_walk(s, nd)) &&
+ 		(err = mountpoint_last(nd)) > 0) {
+ 		s = trailing_symlink(nd);	/* > 0: symlink to follow */
+ 	}
+ 	if (!err) {
+ 		*path = nd->path;	/* hand the result to the caller... */
+ 		nd->path.mnt = NULL;	/* ...and clear nd's copy of it */
+ 		nd->path.dentry = NULL;
+ 		follow_mount(path);
+ 	}
+ 	terminate_walk(nd);
+ 	return err;
+ }
+ /* Run path_mountpoint() with -ECHILD/-ESTALE retries, then putname(@name). */
+ static int
+ filename_mountpoint(int dfd, struct filename *name, struct path *path,
+ 			unsigned int flags)
+ {
+ 	struct nameidata nd;
+ 	int error;
+ 	if (IS_ERR(name))	/* propagate an ERR_PTR name as its errno */
+ 		return PTR_ERR(name);
+ 	set_nameidata(&nd, dfd, name);
+ 	error = path_mountpoint(&nd, flags | LOOKUP_RCU, path);
+ 	if (unlikely(error == -ECHILD))	/* retry without LOOKUP_RCU */
+ 		error = path_mountpoint(&nd, flags, path);
+ 	if (unlikely(error == -ESTALE))	/* retry with LOOKUP_REVAL */
+ 		error = path_mountpoint(&nd, flags | LOOKUP_REVAL, path);
+ 	if (likely(!error))
+ 		audit_inode(name, path->dentry, AUDIT_INODE_NOEVAL);
+ 	restore_nameidata();
+ 	putname(name);
+ 	return error;
+ }
+ 
+ /**
+  * user_path_mountpoint_at - lookup a path from userland in order to umount it
+  * @dfd:	directory file descriptor
+  * @name:	pathname from userland
+  * @flags:	lookup flags
+  * @path:	pointer to container to hold result
+  *
+  * A umount is a special case for path walking. We're not actually interested
+  * in the inode in this situation, and ESTALE errors can be a problem. We
+  * simply want to track down the dentry and vfsmount attached at the mountpoint
+  * and avoid revalidating the last component.
+  *
+  * Returns 0 and populates "path" on success.
+  */
+ int
+ user_path_mountpoint_at(int dfd, const char __user *name, unsigned int flags,
+ 			struct path *path)
+ {
+ 	return filename_mountpoint(dfd, getname(name), path, flags);
+ }
+ /* As user_path_mountpoint_at(), but @name is fetched with getname_kernel(). */
+ int
+ kern_path_mountpoint(int dfd, const char *name, struct path *path,
+ 			unsigned int flags)
+ {
+ 	return filename_mountpoint(dfd, getname_kernel(name), path, flags);
+ }
+ EXPORT_SYMBOL(kern_path_mountpoint);
+ /* 0 if current's fsuid owns @inode or @dir, or has CAP_FOWNER; else 1. */
+ int __check_sticky(struct inode *dir, struct inode *inode)
+ {
+ 	kuid_t fsuid = current_fsuid();
+ 
+ 	if (uid_eq(inode->i_uid, fsuid))	/* caller owns the inode */
+ 		return 0;
+ 	if (uid_eq(dir->i_uid, fsuid))		/* caller owns the directory */
+ 		return 0;
+ 	return !capable_wrt_inode_uidgid(inode, CAP_FOWNER);
+ }
+ EXPORT_SYMBOL(__check_sticky);
+ 
+ /*
+  * Check whether we can remove a link victim from directory dir, and check
+  * whether the type of victim is right.
+  *  1. We can't do it if dir is read-only (done in permission())
+  *  2. We should have write and exec permissions on dir
+  *  3. We can't remove anything from append-only dir
+  *  4. We can't do anything with immutable dir (done in permission())
+  *  5. If the sticky bit on dir is set we should either
+  *	a. be owner of dir, or
+  *	b. be owner of victim, or
+  *	c. have CAP_FOWNER capability
+  *  6. If the victim is append-only or immutable we can't do anything with
+  *     links pointing to it.
+  *  7. If the victim has an unknown uid or gid we can't change the inode.
+  *  8. If we were asked to remove a directory and victim isn't one - ENOTDIR.
+  *  9. If we were asked to remove a non-directory and victim isn't one - EISDIR.
+  * 10. We can't remove a root or mountpoint.
+  * 11. We don't allow removal of NFS sillyrenamed files; it's handled by
+  *     nfs_async_unlink().
+  */
+ static int may_delete(struct inode *dir, struct dentry *victim, bool isdir)
+ {
+ 	struct inode *inode = d_backing_inode(victim);
+ 	int error;
+ 
+ 	if (d_is_negative(victim))
+ 		return -ENOENT;
+ 	BUG_ON(!inode);
+ 
+ 	BUG_ON(victim->d_parent->d_inode != dir);
+ 
+ 	/* Inode writeback is not safe when the uid or gid are invalid. */
+ 	if (!uid_valid(inode->i_uid) || !gid_valid(inode->i_gid))
+ 		return -EOVERFLOW;
+ 
+ 	audit_inode_child(dir, victim, AUDIT_TYPE_CHILD_DELETE);
+ 
+ 	error = inode_permission(dir, MAY_WRITE | MAY_EXEC);	/* rule 2 */
+ 	if (error)
+ 		return error;
+ 	if (IS_APPEND(dir))	/* rule 3 */
+ 		return -EPERM;
+ 
+ 	if (check_sticky(dir, inode) || IS_APPEND(inode) ||
+ 	    IS_IMMUTABLE(inode) || IS_SWAPFILE(inode) || HAS_UNMAPPED_ID(inode))
+ 		return -EPERM;
+ 	if (isdir) {
+ 		if (!d_is_dir(victim))	/* rule 8 */
+ 			return -ENOTDIR;
+ 		if (IS_ROOT(victim))	/* rule 10 */
+ 			return -EBUSY;
+ 	} else if (d_is_dir(victim))	/* rule 9 */
+ 		return -EISDIR;
+ 	if (IS_DEADDIR(dir))
+ 		return -ENOENT;
+ 	if (victim->d_flags & DCACHE_NFSFS_RENAMED)	/* rule 11 */
+ 		return -EBUSY;
+ 	return 0;
+ }
+ 
+ /* Check whether we can create an object with dentry child in directory
+  *  dir.
+  *  1. We can't do it if child already exists (open has special treatment for
+  *     this case, but since we are inlined it's OK)
+  *  2. We can't do it if dir is read-only (done in permission())
+  *  3. We can't do it if the fs can't represent the fsuid or fsgid.
+  *  4. We should have write and exec permissions on dir
+  *  5. We can't do it if dir is immutable (done in permission())
+  */
+ static inline int may_create(struct inode *dir, struct dentry *child)
+ {
+ 	struct user_namespace *s_user_ns;
+ 	audit_inode_child(dir, child, AUDIT_TYPE_CHILD_CREATE);
+ 	if (child->d_inode)	/* rule 1 */
+ 		return -EEXIST;
+ 	if (IS_DEADDIR(dir))
+ 		return -ENOENT;
+ 	s_user_ns = dir->i_sb->s_user_ns;
+ 	if (!kuid_has_mapping(s_user_ns, current_fsuid()) ||
+ 	    !kgid_has_mapping(s_user_ns, current_fsgid()))
+ 		return -EOVERFLOW;	/* rule 3 */
+ 	return inode_permission(dir, MAY_WRITE | MAY_EXEC);	/* rule 4 */
+ }
+ 
+ /*
+  * p1 and p2 should be directories on the same fs; pair with unlock_rename().
+  */
+ struct dentry *lock_rename(struct dentry *p1, struct dentry *p2)
+ {
+ 	struct dentry *p;
+ 
+ 	if (p1 == p2) {
+ 		/* Same directory: one inode lock, rename mutex not taken. */
+ 		inode_lock_nested(p1->d_inode, I_MUTEX_PARENT);
+ 		return NULL;
+ 	}
+ 
+ 	mutex_lock(&p1->d_sb->s_vfs_rename_mutex);
+ 
+ 	p = d_ancestor(p2, p1);
+ 	if (p) {
+ 		/* Lock order: p2 (parent class), then p1 (child class). */
+ 		inode_lock_nested(p2->d_inode, I_MUTEX_PARENT);
+ 		inode_lock_nested(p1->d_inode, I_MUTEX_CHILD);
+ 		return p;
+ 	}
+ 
+ 	p = d_ancestor(p1, p2);
+ 	if (p) {
+ 		/* Lock order: p1 (parent class), then p2 (child class). */
+ 		inode_lock_nested(p1->d_inode, I_MUTEX_PARENT);
+ 		inode_lock_nested(p2->d_inode, I_MUTEX_CHILD);
+ 		return p;
+ 	}
+ 
+ 	inode_lock_nested(p1->d_inode, I_MUTEX_PARENT);
+ 	inode_lock_nested(p2->d_inode, I_MUTEX_PARENT2);
+ 	return NULL;
+ }
+ EXPORT_SYMBOL(lock_rename);
+ /* Undo lock_rename(): unlock the inode(s) and, if p1 != p2, the rename mutex. */
+ void unlock_rename(struct dentry *p1, struct dentry *p2)
+ {
+ 	inode_unlock(p1->d_inode);
+ 	if (p1 != p2) {
+ 		inode_unlock(p2->d_inode);
+ 		mutex_unlock(&p1->d_sb->s_vfs_rename_mutex);
+ 	}
+ }
+ EXPORT_SYMBOL(unlock_rename);
+ /* Create @dentry in @dir as S_IFREG via ->create(), after may_create() + LSM. */
+ int vfs_create(struct inode *dir, struct dentry *dentry, umode_t mode,
+ 		bool want_excl)
+ {
+ 	int error = may_create(dir, dentry);
+ 	if (error)
+ 		return error;
+ 
+ 	if (!dir->i_op->create)
+ 		return -EACCES;	/* shouldn't it be ENOSYS? */
+ 	mode &= S_IALLUGO;
+ 	mode |= S_IFREG;	/* file type is forced to regular file */
+ 	error = security_inode_create(dir, dentry, mode);
+ 	if (error)
+ 		return error;
+ 	error = dir->i_op->create(dir, dentry, mode, want_excl);
+ 	if (!error)
+ 		fsnotify_create(dir, dentry);
+ 	return error;
+ }
+ EXPORT_SYMBOL(vfs_create);
+ /* Like vfs_create(), but the object is made by callback @f(dentry, mode, arg). */
+ int vfs_mkobj(struct dentry *dentry, umode_t mode,
+ 		int (*f)(struct dentry *, umode_t, void *),
+ 		void *arg)
+ {
+ 	struct inode *dir = dentry->d_parent->d_inode;
+ 	int error = may_create(dir, dentry);
+ 	if (error)
+ 		return error;
+ 
+ 	mode &= S_IALLUGO;
+ 	mode |= S_IFREG;	/* file type is forced to regular file */
+ 	error = security_inode_create(dir, dentry, mode);
+ 	if (error)
+ 		return error;
+ 	error = f(dentry, mode, arg);
+ 	if (!error)
+ 		fsnotify_create(dir, dentry);
+ 	return error;
+ }
+ EXPORT_SYMBOL(vfs_mkobj);
+ /* True unless the mount has MNT_NODEV or its superblock has SB_I_NODEV. */
+ bool may_open_dev(const struct path *path)
+ {
+ 	return !(path->mnt->mnt_flags & MNT_NODEV) &&
+ 		!(path->mnt->mnt_sb->s_iflags & SB_I_NODEV);
+ }
+ /* Checks made on @path before it is opened; returns 0 or an error code. */
+ static int may_open(const struct path *path, int acc_mode, int flag)
+ {
+ 	struct dentry *dentry = path->dentry;
+ 	struct inode *inode = dentry->d_inode;
+ 	int error;
+ 
+ 	if (!inode)
+ 		return -ENOENT;
+ 
+ 	switch (inode->i_mode & S_IFMT) {
+ 	case S_IFLNK:
+ 		return -ELOOP;
+ 	case S_IFDIR:
+ 		if (acc_mode & MAY_WRITE)
+ 			return -EISDIR;
+ 		break;
+ 	case S_IFBLK:
+ 	case S_IFCHR:
+ 		if (!may_open_dev(path))
+ 			return -EACCES;
+ 		/*FALLTHRU*/
+ 	case S_IFIFO:
+ 	case S_IFSOCK:
+ 		flag &= ~O_TRUNC;	/* local copy: only affects checks below */
+ 		break;
+ 	}
+ 
+ 	error = inode_permission(inode, MAY_OPEN | acc_mode);
+ 	if (error)
+ 		return error;
+ 
+ 	/*
+ 	 * An append-only file must be opened in append mode for writing.
+ 	 */
+ 	if (IS_APPEND(inode)) {
+ 		if  ((flag & O_ACCMODE) != O_RDONLY && !(flag & O_APPEND))
+ 			return -EPERM;
+ 		if (flag & O_TRUNC)
+ 			return -EPERM;
+ 	}
+ 
+ 	/* O_NOATIME can only be set by the owner or superuser */
+ 	if (flag & O_NOATIME && !inode_owner_or_capable(inode))
+ 		return -EPERM;
+ 
+ 	return 0;
+ }
+ /* do_truncate() @filp's dentry (size argument 0) while holding write access. */
+ static int handle_truncate(struct file *filp)
+ {
+ 	const struct path *path = &filp->f_path;
+ 	struct inode *inode = path->dentry->d_inode;
+ 	int error = get_write_access(inode);
+ 	if (error)
+ 		return error;
+ 	/*
+ 	 * Refuse to truncate files with mandatory locks held on them.
+ 	 */
+ 	error = locks_verify_locked(filp);
+ 	if (!error)
+ 		error = security_path_truncate(path);
+ 	if (!error) {
+ 		error = do_truncate(path->dentry, 0,
+ 				    ATTR_MTIME|ATTR_CTIME|ATTR_OPEN,
+ 				    filp);
+ 	}
+ 	put_write_access(inode);	/* pairs with get_write_access() above */
+ 	return error;
+ }
+ /* Decrement @flag when its O_ACCMODE bits equal 3; otherwise return it as is. */
+ static inline int open_to_namei_flags(int flag)
+ {
+ 	if ((flag & O_ACCMODE) == 3)
+ 		flag--;
+ 	return flag;
+ }
+ /* Create-on-open checks for @dentry in @dir: LSM, id mapping, dir permission. */
+ static int may_o_create(const struct path *dir, struct dentry *dentry, umode_t mode)
+ {
+ 	struct user_namespace *s_user_ns;
+ 	int error = security_path_mknod(dir, dentry, mode, 0);
+ 	if (error)
+ 		return error;
+ 
+ 	s_user_ns = dir->dentry->d_sb->s_user_ns;
+ 	if (!kuid_has_mapping(s_user_ns, current_fsuid()) ||
+ 	    !kgid_has_mapping(s_user_ns, current_fsgid()))
+ 		return -EOVERFLOW;
+ 
+ 	error = inode_permission(dir->dentry->d_inode, MAY_WRITE | MAY_EXEC);
+ 	if (error)
+ 		return error;
+ 
+ 	return security_inode_create(dir->dentry->d_inode, dentry, mode);
+ }
+ 
+ /*
+  * Attempt to atomically look up, create and open a file from a negative
+  * dentry.
+  *
+  * Returns 0 if successful.  The file will have been created and attached to
+  * @file by the filesystem calling finish_open().
+  *
+  * If the file was looked up only or didn't need creating, FMODE_OPENED won't
+  * be set.  The caller will need to perform the open themselves.  @path will
+  * have been updated to point to the new dentry.  This may be negative.
+  *
+  * Returns an error code otherwise.
+  */
+ static int atomic_open(struct nameidata *nd, struct dentry *dentry,
+ 			struct path *path, struct file *file,
+ 			const struct open_flags *op,
+ 			int open_flag, umode_t mode)
+ {
+ 	struct dentry *const DENTRY_NOT_SET = (void *) -1UL;
+ 	struct inode *dir =  nd->path.dentry->d_inode;
+ 	int error;
+ 
+ 	if (!(~open_flag & (O_EXCL | O_CREAT)))	/* both O_EXCL and O_CREAT */
+ 		open_flag &= ~O_TRUNC;
+ 
+ 	if (nd->flags & LOOKUP_DIRECTORY)
+ 		open_flag |= O_DIRECTORY;
+ 
+ 	file->f_path.dentry = DENTRY_NOT_SET;	/* sentinel, checked below */
+ 	file->f_path.mnt = nd->path.mnt;
+ 	error = dir->i_op->atomic_open(dir, dentry, file,
+ 				       open_to_namei_flags(open_flag), mode);
+ 	d_lookup_done(dentry);
+ 	if (!error) {
+ 		if (file->f_mode & FMODE_OPENED) {
+ 			/*
+ 			 * We didn't have the inode before the open, so check open
+ 			 * permission here.
+ 			 */
+ 			int acc_mode = op->acc_mode;
+ 			if (file->f_mode & FMODE_CREATED) {
+ 				WARN_ON(!(open_flag & O_CREAT));
+ 				fsnotify_create(dir, dentry);
+ 				acc_mode = 0;	/* just created: no access-mode check */
+ 			}
+ 			error = may_open(&file->f_path, acc_mode, open_flag);
+ 			if (WARN_ON(error > 0))
+ 				error = -EINVAL;
+ 		} else if (WARN_ON(file->f_path.dentry == DENTRY_NOT_SET)) {
+ 			error = -EIO;	/* ->atomic_open() left the sentinel in place */
+ 		} else {
+ 			if (file->f_path.dentry) {
+ 				dput(dentry);
+ 				dentry = file->f_path.dentry;
+ 			}
+ 			if (file->f_mode & FMODE_CREATED)
+ 				fsnotify_create(dir, dentry);
+ 			if (unlikely(d_is_negative(dentry))) {
+ 				error = -ENOENT;
+ 			} else {
+ 				path->dentry = dentry;
+ 				path->mnt = nd->path.mnt;
+ 				return 0;
+ 			}
+ 		}
+ 	}
+ 	dput(dentry);
+ 	return error;
+ }
+ 
+ /*
+  * Look up and maybe create and open the last component.
+  *
+  * Must be called with parent locked (exclusive in O_CREAT case).
+  *
+  * Returns 0 on success, that is, if
+  *  the file was successfully atomically created (if necessary) and opened, or
+  *  the file was not completely opened at this time, though lookups and
+  *  creations were performed.
+  * These cases are distinguished by presence of FMODE_OPENED on file->f_mode.
+  * In the latter case dentry returned in @path might be negative if O_CREAT
+  * hadn't been specified.
+  *
+  * An error code is returned on failure.
+  */
+ static int lookup_open(struct nameidata *nd, struct path *path,
+ 			struct file *file,
+ 			const struct open_flags *op,
+ 			bool got_write)
+ {
+ 	struct dentry *dir = nd->path.dentry;
+ 	struct inode *dir_inode = dir->d_inode;
+ 	int open_flag = op->open_flag;
+ 	struct dentry *dentry;
+ 	int error, create_error = 0;
+ 	umode_t mode = op->mode;
+ 	DECLARE_WAIT_QUEUE_HEAD_ONSTACK(wq);
+ 
+ 	if (unlikely(IS_DEADDIR(dir_inode)))
+ 		return -ENOENT;
+ 
+ 	file->f_mode &= ~FMODE_CREATED;
+ 	dentry = d_lookup(dir, &nd->last);
+ 	for (;;) {
+ 		if (!dentry) {
+ 			dentry = d_alloc_parallel(dir, &nd->last, &wq);
+ 			if (IS_ERR(dentry))
+ 				return PTR_ERR(dentry);
+ 		}
+ 		if (d_in_lookup(dentry))
+ 			break;
+ 
+ 		error = d_revalidate(dentry, nd->flags);
+ 		if (likely(error > 0))
+ 			break;
+ 		if (error)
+ 			goto out_dput;
+ 		d_invalidate(dentry);	/* revalidate gave 0: drop and retry */
+ 		dput(dentry);
+ 		dentry = NULL;
+ 	}
+ 	if (dentry->d_inode) {
+ 		/* Cached positive dentry: will open in f_op->open */
+ 		goto out_no_open;
+ 	}
+ 
+ 	/*
+ 	 * Checking write permission is tricky, because we don't know if we are
+ 	 * going to actually need it: O_CREAT opens should work as long as the
+ 	 * file exists.  But checking existence breaks atomicity.  The trick is
+ 	 * to check access and if not granted clear O_CREAT from the flags.
+ 	 *
+ 	 * Another problem is returning the "right" error value (e.g. for an
+ 	 * O_EXCL open we want to return EEXIST not EROFS).
+ 	 */
+ 	if (open_flag & O_CREAT) {
+ 		if (!IS_POSIXACL(dir->d_inode))
+ 			mode &= ~current_umask();
+ 		if (unlikely(!got_write)) {
+ 			create_error = -EROFS;
+ 			open_flag &= ~O_CREAT;
+ 			if (open_flag & (O_EXCL | O_TRUNC))
+ 				goto no_open;
+ 			/* No side effects, safe to clear O_CREAT */
+ 		} else {
+ 			create_error = may_o_create(&nd->path, dentry, mode);
+ 			if (create_error) {
+ 				open_flag &= ~O_CREAT;
+ 				if (open_flag & O_EXCL)
+ 					goto no_open;
+ 			}
+ 		}
+ 	} else if ((open_flag & (O_TRUNC|O_WRONLY|O_RDWR)) &&
+ 		   unlikely(!got_write)) {
+ 		/*
+ 		 * No O_CREATE -> atomicity not a requirement -> fall
+ 		 * back to lookup + open
+ 		 */
+ 		goto no_open;
+ 	}
+ 
+ 	if (dir_inode->i_op->atomic_open) {
+ 		error = atomic_open(nd, dentry, path, file, op, open_flag,
+ 				    mode);
+ 		if (unlikely(error == -ENOENT) && create_error)
+ 			error = create_error;	/* report why creation was denied */
+ 		return error;
+ 	}
+ 
+ no_open:
+ 	if (d_in_lookup(dentry)) {
+ 		struct dentry *res = dir_inode->i_op->lookup(dir_inode, dentry,
+ 							     nd->flags);
+ 		d_lookup_done(dentry);
+ 		if (unlikely(res)) {
+ 			if (IS_ERR(res)) {
+ 				error = PTR_ERR(res);
+ 				goto out_dput;
+ 			}
+ 			dput(dentry);	/* ->lookup() returned a dentry to use instead */
+ 			dentry = res;
+ 		}
+ 	}
+ 
+ 	/* Negative dentry, just create the file */
+ 	if (!dentry->d_inode && (open_flag & O_CREAT)) {
+ 		file->f_mode |= FMODE_CREATED;
+ 		audit_inode_child(dir_inode, dentry, AUDIT_TYPE_CHILD_CREATE);
+ 		if (!dir_inode->i_op->create) {
+ 			error = -EACCES;
+ 			goto out_dput;
+ 		}
+ 		error = dir_inode->i_op->create(dir_inode, dentry, mode,
+ 						open_flag & O_EXCL);
+ 		if (error)
+ 			goto out_dput;
+ 		fsnotify_create(dir_inode, dentry);
+ 	}
+ 	if (unlikely(create_error) && !dentry->d_inode) {
+ 		error = create_error;
+ 		goto out_dput;
+ 	}
+ out_no_open:
+ 	path->dentry = dentry;
+ 	path->mnt = nd->path.mnt;
+ 	return 0;
+ 
+ out_dput:
+ 	dput(dentry);
+ 	return error;
+ }
+ 
+ /*
+  * Handle the last step of open()
+  */
+ static int do_last(struct nameidata *nd,
+ 		   struct file *file, const struct open_flags *op)
+ {
+ 	struct dentry *dir = nd->path.dentry;
+ 	kuid_t dir_uid = nd->inode->i_uid;
+ 	umode_t dir_mode = nd->inode->i_mode;
+ 	int open_flag = op->open_flag;
+ 	bool will_truncate = (open_flag & O_TRUNC) != 0;
+ 	bool got_write = false;
+ 	int acc_mode = op->acc_mode;
+ 	unsigned seq;
+ 	struct inode *inode;
+ 	struct path path;
+ 	int error;
+ 
+ 	nd->flags &= ~LOOKUP_PARENT;
+ 	nd->flags |= op->intent;
+ 
+ 	if (nd->last_type != LAST_NORM) {
+ 		error = handle_dots(nd, nd->last_type);
+ 		if (unlikely(error))
+ 			return error;
+ 		goto finish_open;
+ 	}
+ 
+ 	if (!(open_flag & O_CREAT)) {
+ 		if (nd->last.name[nd->last.len])	/* trailing slashes */
+ 			nd->flags |= LOOKUP_FOLLOW | LOOKUP_DIRECTORY;
+ 		/* we _can_ be in RCU mode here */
+ 		error = lookup_fast(nd, &path, &inode, &seq);
+ 		if (likely(error > 0))
+ 			goto finish_lookup;
+ 
+ 		if (error < 0)
+ 			return error;
+ 
+ 		BUG_ON(nd->inode != dir->d_inode);
+ 		BUG_ON(nd->flags & LOOKUP_RCU);
+ 	} else {
+ 		/* create side of things */
+ 		/*
+ 		 * This will *only* deal with leaving RCU mode - LOOKUP_JUMPED
+ 		 * has been cleared when we got to the last component we are
+ 		 * about to look up
+ 		 */
+ 		error = complete_walk(nd);
+ 		if (error)
+ 			return error;
+ 
+ 		audit_inode(nd->name, dir, AUDIT_INODE_PARENT);
+ 		/* trailing slashes? */
+ 		if (unlikely(nd->last.name[nd->last.len]))
+ 			return -EISDIR;
+ 	}
+ 
+ 	if (open_flag & (O_CREAT | O_TRUNC | O_WRONLY | O_RDWR)) {
+ 		error = mnt_want_write(nd->path.mnt);
+ 		if (!error)
+ 			got_write = true;
+ 		/*
+ 		 * do _not_ fail yet - we might not need that or fail with
+ 		 * a different error; let lookup_open() decide; we'll be
+ 		 * dropping this one anyway.
+ 		 */
+ 	}
+ 	if (open_flag & O_CREAT)
+ 		inode_lock(dir->d_inode);	/* exclusive for creation */
+ 	else
+ 		inode_lock_shared(dir->d_inode);
+ 	error = lookup_open(nd, &path, file, op, got_write);
+ 	if (open_flag & O_CREAT)
+ 		inode_unlock(dir->d_inode);
+ 	else
+ 		inode_unlock_shared(dir->d_inode);
+ 
+ 	if (error)
+ 		goto out;
+ 
+ 	if (file->f_mode & FMODE_OPENED) {
+ 		if ((file->f_mode & FMODE_CREATED) ||
+ 		    !S_ISREG(file_inode(file)->i_mode))
+ 			will_truncate = false;
+ 
+ 		audit_inode(nd->name, file->f_path.dentry, 0);
+ 		goto opened;
+ 	}
+ 
+ 	if (file->f_mode & FMODE_CREATED) {
+ 		/* Don't check for write permission, don't truncate */
+ 		open_flag &= ~O_TRUNC;
+ 		will_truncate = false;
+ 		acc_mode = 0;
+ 		path_to_nameidata(&path, nd);
+ 		goto finish_open_created;
+ 	}
+ 
+ 	/*
+ 	 * If atomic_open() acquired write access it is dropped now due to
+ 	 * possible mount and symlink following (this might be optimized away if
+ 	 * necessary...)
+ 	 */
+ 	if (got_write) {
+ 		mnt_drop_write(nd->path.mnt);
+ 		got_write = false;
+ 	}
+ 
+ 	error = follow_managed(&path, nd);
+ 	if (unlikely(error < 0))
+ 		return error;
+ 
+ 	if (unlikely(d_is_negative(path.dentry))) {
+ 		path_to_nameidata(&path, nd);
+ 		return -ENOENT;
+ 	}
+ 
+ 	/*
+ 	 * create/update audit record if it already exists.
+ 	 */
+ 	audit_inode(nd->name, path.dentry, 0);
+ 
+ 	if (unlikely((open_flag & (O_EXCL | O_CREAT)) == (O_EXCL | O_CREAT))) {
+ 		path_to_nameidata(&path, nd);
+ 		return -EEXIST;
+ 	}
+ 
+ 	seq = 0;	/* out of RCU mode, so the value doesn't matter */
+ 	inode = d_backing_inode(path.dentry);
+ finish_lookup:
+ 	error = step_into(nd, &path, 0, inode, seq);
+ 	if (unlikely(error))
+ 		return error;
+ finish_open:
+ 	/* Why this, you ask?  _Now_ we might have grown LOOKUP_JUMPED... */
+ 	error = complete_walk(nd);
+ 	if (error)
+ 		return error;
+ 	audit_inode(nd->name, nd->path.dentry, 0);
+ 	if (open_flag & O_CREAT) {
+ 		error = -EISDIR;
+ 		if (d_is_dir(nd->path.dentry))
+ 			goto out;
+ 		/* Uses the parent's mode/uid sampled at function entry. */
+ 		error = may_create_in_sticky(dir_mode, dir_uid,
+ 					     d_backing_inode(nd->path.dentry));
+ 		if (unlikely(error))
+ 			goto out;
+ 	}
+ 	error = -ENOTDIR;
+ 	if ((nd->flags & LOOKUP_DIRECTORY) && !d_can_lookup(nd->path.dentry))
+ 		goto out;
+ 	if (!d_is_reg(nd->path.dentry))
+ 		will_truncate = false;
+ 
+ 	if (will_truncate) {
+ 		error = mnt_want_write(nd->path.mnt);
+ 		if (error)
+ 			goto out;
+ 		got_write = true;
+ 	}
+ finish_open_created:
+ 	error = may_open(&nd->path, acc_mode, open_flag);
+ 	if (error)
+ 		goto out;
+ 	BUG_ON(file->f_mode & FMODE_OPENED); /* once it's opened, it's opened */
+ 	error = vfs_open(&nd->path, file);
+ 	if (error)
+ 		goto out;
+ opened:
+ 	error = ima_file_check(file, op->acc_mode);
+ 	if (!error && will_truncate)
+ 		error = handle_truncate(file);
+ out:
+ 	if (unlikely(error > 0)) {
+ 		WARN_ON(1);
+ 		error = -EINVAL;
+ 	}
+ 	if (got_write)
+ 		mnt_drop_write(nd->path.mnt);
+ 	return error;
+ }
+ /* Make a file in directory @dentry via ->tmpfile(); returns child or ERR_PTR(). */
+ struct dentry *vfs_tmpfile(struct dentry *dentry, umode_t mode, int open_flag)
+ {
+ 	struct dentry *child = NULL;
+ 	struct inode *dir = dentry->d_inode;
+ 	struct inode *inode;
+ 	int error;
+ 
+ 	/* we want directory to be writable */
+ 	error = inode_permission(dir, MAY_WRITE | MAY_EXEC);
+ 	if (error)
+ 		goto out_err;
+ 	error = -EOPNOTSUPP;
+ 	if (!dir->i_op->tmpfile)
+ 		goto out_err;
+ 	error = -ENOMEM;
+ 	child = d_alloc(dentry, &slash_name);
+ 	if (unlikely(!child))
+ 		goto out_err;
+ 	error = dir->i_op->tmpfile(dir, child, mode);
+ 	if (error)
+ 		goto out_err;
+ 	error = -ENOENT;
+ 	inode = child->d_inode;
+ 	if (unlikely(!inode))
+ 		goto out_err;
+ 	if (!(open_flag & O_EXCL)) {
+ 		/* Without O_EXCL the inode is flagged I_LINKABLE. */
+ 		spin_lock(&inode->i_lock);
+ 		inode->i_state |= I_LINKABLE;
+ 		spin_unlock(&inode->i_lock);
+ 	}
+ 	ima_post_create_tmpfile(inode);
+ 	return child;
+ 
+ out_err:
+ 	dput(child);	/* child is still NULL on the earliest failures */
+ 	return ERR_PTR(error);
+ }
+ EXPORT_SYMBOL(vfs_tmpfile);
+ /* __O_TMPFILE branch of path_openat(): look up the directory, vfs_tmpfile(). */
+ static int do_tmpfile(struct nameidata *nd, unsigned flags,
+ 		const struct open_flags *op,
+ 		struct file *file)
+ {
+ 	struct dentry *child;
+ 	struct path path;
+ 	int error = path_lookupat(nd, flags | LOOKUP_DIRECTORY, &path);
+ 	if (unlikely(error))
+ 		return error;
+ 	error = mnt_want_write(path.mnt);
+ 	if (unlikely(error))
+ 		goto out;
+ 	child = vfs_tmpfile(path.dentry, op->mode, op->open_flag);
+ 	error = PTR_ERR(child);
+ 	if (IS_ERR(child))
+ 		goto out2;
+ 	dput(path.dentry);	/* @path switches from the directory to child */
+ 	path.dentry = child;
+ 	audit_inode(nd->name, child, 0);
+ 	/* Don't check for other permissions, the inode was just created */
+ 	error = may_open(&path, 0, op->open_flag);
+ 	if (error)
+ 		goto out2;
+ 	file->f_path.mnt = path.mnt;
+ 	error = finish_open(file, child, NULL);
+ out2:
+ 	mnt_drop_write(path.mnt);
+ out:
+ 	path_put(&path);
+ 	return error;
+ }
+ /* O_PATH branch of path_openat(): path_lookupat() followed by vfs_open(). */
+ static int do_o_path(struct nameidata *nd, unsigned flags, struct file *file)
+ {
+ 	struct path path;
+ 	int error = path_lookupat(nd, flags, &path);
+ 	if (!error) {
+ 		audit_inode(nd->name, path.dentry, 0);
+ 		error = vfs_open(&path, file);
+ 		path_put(&path);
+ 	}
+ 	return error;
+ }
+ /* One open attempt using lookup @flags; returns the file or an ERR_PTR(). */
+ static struct file *path_openat(struct nameidata *nd,
+ 			const struct open_flags *op, unsigned flags)
+ {
+ 	struct file *file;
+ 	int error;
+ 
+ 	file = alloc_empty_file(op->open_flag, current_cred());
+ 	if (IS_ERR(file))
+ 		return file;
+ 
+ 	if (unlikely(file->f_flags & __O_TMPFILE)) {
+ 		error = do_tmpfile(nd, flags, op, file);
+ 	} else if (unlikely(file->f_flags & O_PATH)) {
+ 		error = do_o_path(nd, flags, file);
+ 	} else {
+ 		const char *s = path_init(nd, flags);
+ 		while (!(error = link_path_walk(s, nd)) &&
+ 			(error = do_last(nd, file, op)) > 0) {
+ 			nd->flags &= ~(LOOKUP_OPEN|LOOKUP_CREATE|LOOKUP_EXCL);
+ 			s = trailing_symlink(nd);
+ 		}
+ 		terminate_walk(nd);
+ 	}
+ 	if (likely(!error)) {
+ 		if (likely(file->f_mode & FMODE_OPENED))
+ 			return file;
+ 		WARN_ON(1);	/* success reported but file was never opened */
+ 		error = -EINVAL;
+ 	}
+ 	fput(file);
+ 	if (error == -EOPENSTALE) {	/* turn into an error the callers retry on */
+ 		if (flags & LOOKUP_RCU)
+ 			error = -ECHILD;
+ 		else
+ 			error = -ESTALE;
+ 	}
+ 	return ERR_PTR(error);
+ }
+ /* Open @pathname: try with LOOKUP_RCU, then retry on -ECHILD and -ESTALE. */
+ struct file *do_filp_open(int dfd, struct filename *pathname,
+ 		const struct open_flags *op)
+ {
+ 	struct nameidata nd;
+ 	int flags = op->lookup_flags;
+ 	struct file *filp;
+ 
+ 	set_nameidata(&nd, dfd, pathname);
+ 	filp = path_openat(&nd, op, flags | LOOKUP_RCU);
+ 	if (unlikely(filp == ERR_PTR(-ECHILD)))	/* retry without LOOKUP_RCU */
+ 		filp = path_openat(&nd, op, flags);
+ 	if (unlikely(filp == ERR_PTR(-ESTALE)))	/* retry with LOOKUP_REVAL */
+ 		filp = path_openat(&nd, op, flags | LOOKUP_REVAL);
+ 	restore_nameidata();
+ 	return filp;
+ }
+ /* As do_filp_open(), with LOOKUP_ROOT set and nd.root taken from @mnt/@dentry. */
+ struct file *do_file_open_root(struct dentry *dentry, struct vfsmount *mnt,
+ 		const char *name, const struct open_flags *op)
+ {
+ 	struct nameidata nd;
+ 	struct file *file;
+ 	struct filename *filename;
+ 	int flags = op->lookup_flags | LOOKUP_ROOT;
+ 
+ 	nd.root.mnt = mnt;
+ 	nd.root.dentry = dentry;
+ 
+ 	if (d_is_symlink(dentry) && op->intent & LOOKUP_OPEN)
+ 		return ERR_PTR(-ELOOP);
+ 
+ 	filename = getname_kernel(name);
+ 	if (IS_ERR(filename))
+ 		return ERR_CAST(filename);
+ 
+ 	set_nameidata(&nd, -1, filename);
+ 	file = path_openat(&nd, op, flags | LOOKUP_RCU);
+ 	if (unlikely(file == ERR_PTR(-ECHILD)))	/* retry without LOOKUP_RCU */
+ 		file = path_openat(&nd, op, flags);
+ 	if (unlikely(file == ERR_PTR(-ESTALE)))	/* retry with LOOKUP_REVAL */
+ 		file = path_openat(&nd, op, flags | LOOKUP_REVAL);
+ 	restore_nameidata();
+ 	putname(filename);
+ 	return file;
+ }
+ /* Look up the parent of @name and return a negative dentry for creation. */
+ static struct dentry *filename_create(int dfd, struct filename *name,
+ 				struct path *path, unsigned int lookup_flags)
+ {
+ 	struct dentry *dentry = ERR_PTR(-EEXIST);	/* result if !LAST_NORM */
+ 	struct qstr last;
+ 	int type;
+ 	int err2;
+ 	int error;
+ 	bool is_dir = (lookup_flags & LOOKUP_DIRECTORY);	/* before the mask */
+ 
+ 	/*
+ 	 * Note that only LOOKUP_REVAL and LOOKUP_DIRECTORY matter here. Any
+ 	 * other flags passed in are ignored!
+ 	 */
+ 	lookup_flags &= LOOKUP_REVAL;
+ 
+ 	name = filename_parentat(dfd, name, lookup_flags, path, &last, &type);
+ 	if (IS_ERR(name))
+ 		return ERR_CAST(name);
+ 
+ 	/*
+ 	 * Yucky last component or no last component at all?
+ 	 * (foo/., foo/.., /////)
+ 	 */
+ 	if (unlikely(type != LAST_NORM))
+ 		goto out;
+ 
+ 	/* don't fail immediately if it's r/o, at least try to report other errors */
+ 	err2 = mnt_want_write(path->mnt);
+ 	/*
+ 	 * Do the final lookup.
+ 	 */
+ 	lookup_flags |= LOOKUP_CREATE | LOOKUP_EXCL;
+ 	inode_lock_nested(path->dentry->d_inode, I_MUTEX_PARENT);
+ 	dentry = __lookup_hash(&last, path->dentry, lookup_flags);
+ 	if (IS_ERR(dentry))
+ 		goto unlock;
+ 
+ 	error = -EEXIST;
+ 	if (d_is_positive(dentry))
+ 		goto fail;
+ 
+ 	/*
+ 	 * Special case - lookup gave negative, but... we had foo/bar/
+ 	 * From the vfs_mknod() POV we just have a negative dentry -
+ 	 * all is fine. Let's be bastards - you had / on the end, you've
+ 	 * been asking for (non-existent) directory. -ENOENT for you.
+ 	 */
+ 	if (unlikely(!is_dir && last.name[last.len])) {
+ 		error = -ENOENT;
+ 		goto fail;
+ 	}
+ 	if (unlikely(err2)) {
+ 		error = err2;
+ 		goto fail;
+ 	}
+ 	putname(name);
+ 	return dentry;	/* parent stays locked; see done_path_create() */
+ fail:
+ 	dput(dentry);
+ 	dentry = ERR_PTR(error);
+ unlock:
+ 	inode_unlock(path->dentry->d_inode);
+ 	if (!err2)
+ 		mnt_drop_write(path->mnt);
+ out:
+ 	path_put(path);
+ 	putname(name);
+ 	return dentry;
+ }
+ /* filename_create() for a pathname fetched with getname_kernel(). */
+ struct dentry *kern_path_create(int dfd, const char *pathname,
+ 				struct path *path, unsigned int lookup_flags)
+ {
+ 	return filename_create(dfd, getname_kernel(pathname),
+ 				path, lookup_flags);
+ }
+ EXPORT_SYMBOL(kern_path_create);
+ /* Release what a successful filename_create() left held. */
+ void done_path_create(struct path *path, struct dentry *dentry)
+ {
+ 	dput(dentry);
+ 	inode_unlock(path->dentry->d_inode);
+ 	mnt_drop_write(path->mnt);
+ 	path_put(path);
+ }
+ EXPORT_SYMBOL(done_path_create);
+ /* filename_create() for a userspace pathname fetched with getname(). */
+ inline struct dentry *user_path_create(int dfd, const char __user *pathname,
+ 				struct path *path, unsigned int lookup_flags)
+ {
+ 	return filename_create(dfd, getname(pathname), path, lookup_flags);
+ }
+ EXPORT_SYMBOL(user_path_create);
+ /* Create a node at @dentry in @dir via ->mknod() after permission checks. */
+ int vfs_mknod(struct inode *dir, struct dentry *dentry, umode_t mode, dev_t dev)
+ {
+ 	int error = may_create(dir, dentry);
+ 
+ 	if (error)
+ 		return error;
+ 
+ 	/* Character and block device nodes require CAP_MKNOD. */
+ 	if ((S_ISCHR(mode) || S_ISBLK(mode)) && !capable(CAP_MKNOD))
+ 		return -EPERM;
+ 
+ 	if (!dir->i_op->mknod)
+ 		return -EPERM;
+ 
+ 	error = devcgroup_inode_mknod(mode, dev);
+ 	if (error)
+ 		return error;
+ 
+ 	error = security_inode_mknod(dir, dentry, mode, dev);
+ 	if (error)
+ 		return error;
+ 
+ 	error = dir->i_op->mknod(dir, dentry, mode, dev);
+ 	if (!error)
+ 		fsnotify_create(dir, dentry);
+ 	return error;
+ }
+ EXPORT_SYMBOL(vfs_mknod);
+ /* Check the file-type bits of @mode: 0 if allowed, else -EPERM or -EINVAL. */
+ static int may_mknod(umode_t mode)
+ {
+ 	switch (mode & S_IFMT) {
+ 	case S_IFREG:
+ 	case S_IFCHR:
+ 	case S_IFBLK:
+ 	case S_IFIFO:
+ 	case S_IFSOCK:
+ 	case 0: /* zero mode translates to S_IFREG */
+ 		return 0;
+ 	case S_IFDIR:
+ 		return -EPERM;
+ 	default:
+ 		return -EINVAL;
+ 	}
+ }
+ /* Common body of the mknod and mknodat system calls. */
+ long do_mknodat(int dfd, const char __user *filename, umode_t mode,
+ 		unsigned int dev)
+ {
+ 	struct dentry *dentry;
+ 	struct path path;
+ 	int error;
+ 	unsigned int lookup_flags = 0;
+ 
+ 	error = may_mknod(mode);
+ 	if (error)
+ 		return error;
+ retry:
+ 	dentry = user_path_create(dfd, filename, &path, lookup_flags);
+ 	if (IS_ERR(dentry))
+ 		return PTR_ERR(dentry);
+ 
+ 	if (!IS_POSIXACL(path.dentry->d_inode))
+ 		mode &= ~current_umask();
+ 
+ 	if (gr_handle_chroot_mknod(dentry, path.mnt, mode)) {
+ 		error = -EPERM;
+ 		goto out;
+ 	}
+ 
+ 	error = security_path_mknod(&path, dentry, mode, dev);
+ 	if (error)
+ 		goto out;
+ 	switch (mode & S_IFMT) {	/* other types were rejected by may_mknod() */
+ 		case 0: case S_IFREG:
+ 			error = vfs_create(path.dentry->d_inode,dentry,mode,true);
+ 			if (!error)
+ 				ima_post_path_mknod(dentry);
+ 			break;
+ 		case S_IFCHR: case S_IFBLK:
+ 			error = vfs_mknod(path.dentry->d_inode,dentry,mode,
+ 					new_decode_dev(dev));
+ 			break;
+ 		case S_IFIFO: case S_IFSOCK:
+ 			error = vfs_mknod(path.dentry->d_inode,dentry,mode,0);
+ 			break;
+ 	}
+ out:
+ 	done_path_create(&path, dentry);
+ 	if (retry_estale(error, lookup_flags)) {
+ 		lookup_flags |= LOOKUP_REVAL;	/* redo the lookup with REVAL set */
+ 		goto retry;
+ 	}
+ 	return error;
+ }
+ /* mknodat system call: forwards its arguments to do_mknodat(). */
+ SYSCALL_DEFINE4(mknodat, int, dfd, const char __user *, filename, umode_t, mode,
+ 		unsigned int, dev)
+ {
+ 	return do_mknodat(dfd, filename, mode, dev);
+ }
+ /* mknod system call: do_mknodat() with AT_FDCWD as the directory fd. */
+ SYSCALL_DEFINE3(mknod, const char __user *, filename, umode_t, mode, unsigned, dev)
+ {
+ 	return do_mknodat(AT_FDCWD, filename, mode, dev);
+ }
+ /* Create directory @dentry in @dir via ->mkdir() after permission checks. */
+ int vfs_mkdir(struct inode *dir, struct dentry *dentry, umode_t mode)
+ {
+ 	int error = may_create(dir, dentry);
+ 	unsigned max_links = dir->i_sb->s_max_links;
+ 
+ 	if (error)
+ 		return error;
+ 
+ 	if (!dir->i_op->mkdir)
+ 		return -EPERM;
+ 
+ 	mode &= (S_IRWXUGO|S_ISVTX);
+ 	error = security_inode_mkdir(dir, dentry, mode);
+ 	if (error)
+ 		return error;
+ 
+ 	/* A zero s_max_links skips the link-count limit check. */
+ 	if (max_links && dir->i_nlink >= max_links)
+ 		return -EMLINK;
+ 
+ 	error = dir->i_op->mkdir(dir, dentry, mode);
+ 	if (!error)
+ 		fsnotify_mkdir(dir, dentry);
+ 	return error;
+ }
+ EXPORT_SYMBOL(vfs_mkdir);
+ 
+ long do_mkdirat(int dfd, const char __user *pathname, umode_t mode)
+ {	/* Create a directory at pathname (resolved against dfd); returns 0 or -errno. */
+ 	struct dentry *dentry;
+ 	struct path path;
+ 	int error;
+ 	unsigned int lookup_flags = LOOKUP_DIRECTORY;
+ 
+ retry:
+ 	dentry = user_path_create(dfd, pathname, &path, lookup_flags);
+ 	if (IS_ERR(dentry))
+ 		return PTR_ERR(dentry);
+ 
+ 	if (!IS_POSIXACL(path.dentry->d_inode))
+ 		mode &= ~current_umask();	/* umask is applied unless the parent is IS_POSIXACL */
+ 	error = security_path_mkdir(&path, dentry, mode);
+ 	if (!error)
+ 		error = vfs_mkdir(path.dentry->d_inode, dentry, mode);
+ 	done_path_create(&path, dentry);	/* always runs before returning or retrying */
+ 	if (retry_estale(error, lookup_flags)) {
+ 		lookup_flags |= LOOKUP_REVAL;	/* repeat the lookup with LOOKUP_REVAL set */
+ 		goto retry;
+ 	}
+ 	return error;
+ }
+ 
+ SYSCALL_DEFINE3(mkdirat, int, dfd, const char __user *, pathname, umode_t, mode)
+ {	/* mkdirat entry point: forwards its arguments unchanged to do_mkdirat() */
+ 	return do_mkdirat(dfd, pathname, mode);
+ }
+ 
+ SYSCALL_DEFINE2(mkdir, const char __user *, pathname, umode_t, mode)
+ {	/* mkdir entry point: do_mkdirat() with dfd fixed to AT_FDCWD */
+ 	return do_mkdirat(AT_FDCWD, pathname, mode);
+ }
+ 
+ int vfs_rmdir(struct inode *dir, struct dentry *dentry)
+ {	/* Remove directory 'dentry' from parent 'dir'; returns 0 or -errno. */
+ 	int error = may_delete(dir, dentry, 1);	/* 1 is the is-directory argument (vfs_rename passes is_dir here) */
+ 
+ 	if (error)
+ 		return error;
+ 
+ 	if (!dir->i_op->rmdir)
+ 		return -EPERM;
+ 
+ 	dget(dentry);	/* reference held across the removal; dropped at out: */
+ 	inode_lock(dentry->d_inode);	/* victim inode stays locked until out: */
+ 
+ 	error = -EBUSY;
+ 	if (is_local_mountpoint(dentry))
+ 		goto out;
+ 
+ 	error = security_inode_rmdir(dir, dentry);
+ 	if (error)
+ 		goto out;
+ 
+ 	error = dir->i_op->rmdir(dir, dentry);
+ 	if (error)
+ 		goto out;
+ 
+ 	shrink_dcache_parent(dentry);	/* from here on the filesystem has removed the directory */
+ 	dentry->d_inode->i_flags |= S_DEAD;
+ 	dont_mount(dentry);
+ 	detach_mounts(dentry);
+ 	fsnotify_rmdir(dir, dentry);
+ 
+ out:
+ 	inode_unlock(dentry->d_inode);
+ 	dput(dentry);	/* pairs with the dget() above */
+ 	if (!error)
+ 		d_delete(dentry);	/* done only on success, after the victim is unlocked */
+ 	return error;
+ }
+ EXPORT_SYMBOL(vfs_rmdir);
+ 
+ long do_rmdir(int dfd, const char __user *pathname)
+ {	/* Remove the directory named by pathname (resolved against dfd); returns 0 or -errno. */
+ 	int error = 0;
+ 	struct filename *name;
+ 	struct dentry *dentry;
+ 	struct path path;
+ 	struct qstr last;
+ 	int type;
+ 	unsigned int lookup_flags = 0;
+ retry:
+ 	name = filename_parentat(dfd, getname(pathname), lookup_flags,
+ 				&path, &last, &type);	/* resolves the parent; 'last' and 'type' describe the final component */
+ 	if (IS_ERR(name))
+ 		return PTR_ERR(name);
+ 
+ 	switch (type) {	/* "..", "." and the root each fail with their own errno */
+ 	case LAST_DOTDOT:
+ 		error = -ENOTEMPTY;
+ 		goto exit1;
+ 	case LAST_DOT:
+ 		error = -EINVAL;
+ 		goto exit1;
+ 	case LAST_ROOT:
+ 		error = -EBUSY;
+ 		goto exit1;
+ 	}
+ 
+ 	error = mnt_want_write(path.mnt);
+ 	if (error)
+ 		goto exit1;
+ 
+ 	inode_lock_nested(path.dentry->d_inode, I_MUTEX_PARENT);	/* parent stays locked across lookup and removal */
+ 	dentry = __lookup_hash(&last, path.dentry, lookup_flags);
+ 	error = PTR_ERR(dentry);
+ 	if (IS_ERR(dentry))
+ 		goto exit2;
+ 	if (!dentry->d_inode) {	/* negative dentry: there is nothing to remove */
+ 		error = -ENOENT;
+ 		goto exit3;
+ 	}
+ 	error = security_path_rmdir(&path, dentry);
+ 	if (error)
+ 		goto exit3;
+ 	error = vfs_rmdir(path.dentry->d_inode, dentry);
+ exit3:
+ 	dput(dentry);	/* drops the reference obtained from __lookup_hash() */
+ exit2:
+ 	inode_unlock(path.dentry->d_inode);
+ 	mnt_drop_write(path.mnt);	/* pairs with mnt_want_write() above */
+ exit1:
+ 	path_put(&path);
+ 	putname(name);	/* released before a retry, which calls getname() again */
+ 	if (retry_estale(error, lookup_flags)) {
+ 		lookup_flags |= LOOKUP_REVAL;
+ 		goto retry;
+ 	}
+ 	return error;
+ }
+ 
+ SYSCALL_DEFINE1(rmdir, const char __user *, pathname)
+ {	/* rmdir entry point: do_rmdir() with dfd fixed to AT_FDCWD */
+ 	return do_rmdir(AT_FDCWD, pathname);
+ }
+ 
+ /**
+  * vfs_unlink - unlink a filesystem object
+  * @dir:	parent directory
+  * @dentry:	victim
+  * @delegated_inode: returns victim inode, if the inode is delegated.
+  *
+  * The caller must hold dir->i_mutex.
+  *
+  * If vfs_unlink discovers a delegation, it will return -EWOULDBLOCK and
+  * return a reference to the inode in delegated_inode.  The caller
+  * should then break the delegation on that inode and retry.  Because
+  * breaking a delegation may take a long time, the caller should drop
+  * dir->i_mutex before doing so.
+  *
+  * Alternatively, a caller may pass NULL for delegated_inode.  This may
+  * be appropriate for callers that expect the underlying filesystem not
+  * to be NFS exported.
+  */
+ int vfs_unlink(struct inode *dir, struct dentry *dentry, struct inode **delegated_inode)
+ {
+ 	struct inode *target = dentry->d_inode;	/* victim inode: locked below and passed to fsnotify */
+ 	int error = may_delete(dir, dentry, 0);	/* 0 is the is-directory argument (vfs_rmdir passes 1) */
+ 
+ 	if (error)
+ 		return error;
+ 
+ 	if (!dir->i_op->unlink)
+ 		return -EPERM;
+ 
+ 	inode_lock(target);
+ 	if (is_local_mountpoint(dentry))
+ 		error = -EBUSY;
+ 	else {
+ 		error = security_inode_unlink(dir, dentry);
+ 		if (!error) {
+ 			error = try_break_deleg(target, delegated_inode);	/* delegation handling is described in the header comment */
+ 			if (error)
+ 				goto out;
+ 			error = dir->i_op->unlink(dir, dentry);
+ 			if (!error) {
+ 				dont_mount(dentry);
+ 				detach_mounts(dentry);
+ 				fsnotify_unlink(dir, dentry);
+ 			}
+ 		}
+ 	}
+ out:
+ 	inode_unlock(target);
+ 
+ 	/* We don't d_delete() NFS sillyrenamed files--they still exist. */
+ 	if (!error && !(dentry->d_flags & DCACHE_NFSFS_RENAMED)) {
+ 		fsnotify_link_count(target);
+ 		d_delete(dentry);
+ 	}
+ 
+ 	return error;
+ }
+ EXPORT_SYMBOL(vfs_unlink);
+ 
+ /*
+  * Make sure that the actual truncation of the file will occur outside its
+  * directory's i_mutex.  Truncate can take a long time if there is a lot of
+  * writeout happening, and we don't want to prevent access to the directory
+  * while waiting on the I/O.
+  */
+ long do_unlinkat(int dfd, struct filename *name)
+ {	/* Unlink 'name' relative to dfd; returns 0 or -errno. putname(name) runs at exit1. */
+ 	int error;
+ 	struct dentry *dentry;
+ 	struct path path;
+ 	struct qstr last;
+ 	int type;
+ 	struct inode *inode = NULL;
+ 	struct inode *delegated_inode = NULL;
+ 	unsigned int lookup_flags = 0;
+ retry:
+ 	name = filename_parentat(dfd, name, lookup_flags, &path, &last, &type);
+ 	if (IS_ERR(name))
+ 		return PTR_ERR(name);
+ 
+ 	error = -EISDIR;
+ 	if (type != LAST_NORM)	/* anything but a normal last component fails with -EISDIR */
+ 		goto exit1;
+ 
+ 	error = mnt_want_write(path.mnt);
+ 	if (error)
+ 		goto exit1;
+ retry_deleg:
+ 	inode_lock_nested(path.dentry->d_inode, I_MUTEX_PARENT);
+ 	dentry = __lookup_hash(&last, path.dentry, lookup_flags);
+ 	error = PTR_ERR(dentry);
+ 	if (!IS_ERR(dentry)) {
+ 		/* Why not before? Because we want correct error value */
+ 		if (last.name[last.len])	/* the name continues past the last component */
+ 			goto slashes;
+ 		inode = dentry->d_inode;
+ 		if (d_is_negative(dentry))
+ 			goto slashes;
+ 		ihold(inode);	/* extra reference so the iput() below runs after the parent is unlocked */
+ 		error = security_path_unlink(&path, dentry);
+ 		if (error)
+ 			goto exit2;
+ 		error = vfs_unlink(path.dentry->d_inode, dentry, &delegated_inode);
+ exit2:
+ 		dput(dentry);	/* also the re-entry point for the slashes: path below */
+ 	}
+ 	inode_unlock(path.dentry->d_inode);
+ 	if (inode)
+ 		iput(inode);	/* truncate the inode here */
+ 	inode = NULL;
+ 	if (delegated_inode) {
+ 		error = break_deleg_wait(&delegated_inode);
+ 		if (!error)
+ 			goto retry_deleg;	/* delegation broken: retry with the parent path still held */
+ 	}
+ 	mnt_drop_write(path.mnt);
+ exit1:
+ 	path_put(&path);
+ 	if (retry_estale(error, lookup_flags)) {
+ 		lookup_flags |= LOOKUP_REVAL;
+ 		inode = NULL;
+ 		goto retry;	/* 'name' is not released yet; it is reused by the retry */
+ 	}
+ 	putname(name);
+ 	return error;
+ 
+ slashes:	/* chooses the errno for a bad last component, then rejoins the cleanup at exit2 */
+ 	if (d_is_negative(dentry))
+ 		error = -ENOENT;
+ 	else if (d_is_dir(dentry))
+ 		error = -EISDIR;
+ 	else
+ 		error = -ENOTDIR;
+ 	goto exit2;
+ }
+ 
+ SYSCALL_DEFINE3(unlinkat, int, dfd, const char __user *, pathname, int, flag)
+ {	/* unlinkat entry point: AT_REMOVEDIR selects do_rmdir(), otherwise do_unlinkat() */
+ 	if ((flag & ~AT_REMOVEDIR) != 0)	/* AT_REMOVEDIR is the only flag accepted */
+ 		return -EINVAL;
+ 
+ 	if (flag & AT_REMOVEDIR)
+ 		return do_rmdir(dfd, pathname);
+ 
+ 	return do_unlinkat(dfd, getname(pathname));
+ }
+ 
+ SYSCALL_DEFINE1(unlink, const char __user *, pathname)
+ {	/* unlink entry point: do_unlinkat() with dfd fixed to AT_FDCWD */
+ 	return do_unlinkat(AT_FDCWD, getname(pathname));
+ }
+ 
+ int vfs_symlink(struct inode *dir, struct dentry *dentry, const char *oldname)
+ {
+ 	int rc;
+ 
+ 	/* Create permission in dir is checked first, then ->symlink support. */
+ 	rc = may_create(dir, dentry);
+ 	if (rc)
+ 		return rc;
+ 	if (!dir->i_op->symlink)
+ 		return -EPERM;
+ 	rc = security_inode_symlink(dir, dentry, oldname);
+ 	if (rc)
+ 		return rc;
+ 	rc = dir->i_op->symlink(dir, dentry, oldname);
+ 	if (rc)
+ 		return rc;
+ 	fsnotify_create(dir, dentry);	/* notification is sent only after a successful create */
+ 	return 0;
+ }
+ EXPORT_SYMBOL(vfs_symlink);
+ 
+ long do_symlinkat(const char __user *oldname, int newdfd,
+ 		  const char __user *newname)
+ {	/* Create symlink newname (resolved against newdfd) with body oldname; returns 0 or -errno. */
+ 	int error;
+ 	struct filename *from;
+ 	struct dentry *dentry;
+ 	struct path path;
+ 	unsigned int lookup_flags = 0;
+ 
+ 	from = getname(oldname);	/* copied once; reused unchanged across retries */
+ 	if (IS_ERR(from))
+ 		return PTR_ERR(from);
+ retry:
+ 	dentry = user_path_create(newdfd, newname, &path, lookup_flags);
+ 	error = PTR_ERR(dentry);
+ 	if (IS_ERR(dentry))
+ 		goto out_putname;
+ 
+ 	error = security_path_symlink(&path, dentry, from->name);
+ 	if (!error)
+ 		error = vfs_symlink(path.dentry->d_inode, dentry, from->name);
+ 	done_path_create(&path, dentry);
+ 	if (retry_estale(error, lookup_flags)) {
+ 		lookup_flags |= LOOKUP_REVAL;
+ 		goto retry;
+ 	}
+ out_putname:
+ 	putname(from);	/* releases the copy made by getname() above */
+ 	return error;
+ }
+ 
+ SYSCALL_DEFINE3(symlinkat, const char __user *, oldname,
+ 		int, newdfd, const char __user *, newname)
+ {	/* symlinkat entry point: forwards its arguments unchanged to do_symlinkat() */
+ 	return do_symlinkat(oldname, newdfd, newname);
+ }
+ 
+ SYSCALL_DEFINE2(symlink, const char __user *, oldname, const char __user *, newname)
+ {	/* symlink entry point: do_symlinkat() with newdfd fixed to AT_FDCWD */
+ 	return do_symlinkat(oldname, AT_FDCWD, newname);
+ }
+ 
+ /**
+  * vfs_link - create a new link
+  * @old_dentry:	object to be linked
+  * @dir:	new parent
+  * @new_dentry:	where to create the new link
+  * @delegated_inode: returns inode needing a delegation break
+  *
+  * The caller must hold dir->i_mutex
+  *
+  * If vfs_link discovers a delegation on the to-be-linked file in need
+  * of breaking, it will return -EWOULDBLOCK and return a reference to the
+  * inode in delegated_inode.  The caller should then break the delegation
+  * and retry.  Because breaking a delegation may take a long time, the
+  * caller should drop the i_mutex before doing so.
+  *
+  * Alternatively, a caller may pass NULL for delegated_inode.  This may
+  * be appropriate for callers that expect the underlying filesystem not
+  * to be NFS exported.
+  */
+ int vfs_link(struct dentry *old_dentry, struct inode *dir, struct dentry *new_dentry, struct inode **delegated_inode)
+ {
+ 	struct inode *inode = old_dentry->d_inode;
+ 	unsigned max_links = dir->i_sb->s_max_links;	/* zero disables the -EMLINK check below */
+ 	int error;
+ 
+ 	if (!inode)
+ 		return -ENOENT;
+ 
+ 	error = may_create(dir, new_dentry);
+ 	if (error)
+ 		return error;
+ 
+ 	if (dir->i_sb != inode->i_sb)	/* source and new parent must share a superblock */
+ 		return -EXDEV;
+ 
+ 	/*
+ 	 * A link to an append-only or immutable file cannot be created.
+ 	 */
+ 	if (IS_APPEND(inode) || IS_IMMUTABLE(inode))
+ 		return -EPERM;
+ 	/*
+ 	 * Updating the link count will likely cause i_uid and i_gid to
+ 	 * be written back improperly if their true value is unknown to
+ 	 * the vfs.
+ 	 */
+ 	if (HAS_UNMAPPED_ID(inode))
+ 		return -EPERM;
+ 	if (!dir->i_op->link)
+ 		return -EPERM;
+ 	if (S_ISDIR(inode->i_mode))	/* hard links to directories are refused */
+ 		return -EPERM;
+ 
+ 	error = security_inode_link(old_dentry, dir, new_dentry);
+ 	if (error)
+ 		return error;
+ 
+ 	inode_lock(inode);
+ 	/* Make sure we don't allow creating hardlink to an unlinked file */
+ 	if (inode->i_nlink == 0 && !(inode->i_state & I_LINKABLE))
+ 		error =  -ENOENT;
+ 	else if (max_links && inode->i_nlink >= max_links)
+ 		error = -EMLINK;
+ 	else {
+ 		error = try_break_deleg(inode, delegated_inode);
+ 		if (!error)
+ 			error = dir->i_op->link(old_dentry, dir, new_dentry);
+ 	}
+ 
+ 	if (!error && (inode->i_state & I_LINKABLE)) {	/* I_LINKABLE is cleared once a link succeeds */
+ 		spin_lock(&inode->i_lock);
+ 		inode->i_state &= ~I_LINKABLE;
+ 		spin_unlock(&inode->i_lock);
+ 	}
+ 	inode_unlock(inode);
+ 	if (!error)
+ 		fsnotify_link(dir, inode, new_dentry);
+ 	return error;
+ }
+ EXPORT_SYMBOL(vfs_link);
+ 
+ /*
+  * Hardlinks are often used in delicate situations.  We avoid
+  * security-related surprises by not following symlinks on the
+  * newname.  --KAB
+  *
+  * We don't follow them on the oldname either to be compatible
+  * with linux 2.0, and to avoid hard-linking to directories
+  * and other special files.  --ADM
+  */
+ int do_linkat(int olddfd, const char __user *oldname, int newdfd,
+ 	      const char __user *newname, int flags)
+ {	/* Hard-link newname (against newdfd) to oldname (against olddfd); returns 0 or -errno. */
+ 	struct dentry *new_dentry;
+ 	struct path old_path, new_path;
+ 	struct inode *delegated_inode = NULL;
+ 	int how = 0;	/* lookup flags for oldname; LOOKUP_REVAL is added on an ESTALE retry */
+ 	int error;
+ 
+ 	if ((flags & ~(AT_SYMLINK_FOLLOW | AT_EMPTY_PATH)) != 0)
+ 		return -EINVAL;
+ 	/*
+ 	 * To use null names we require CAP_DAC_READ_SEARCH
+ 	 * This ensures that not everyone will be able to create
+ 	 * hardlink using the passed filedescriptor.
+ 	 */
+ 	if (flags & AT_EMPTY_PATH) {
+ 		if (!capable(CAP_DAC_READ_SEARCH))
+ 			return -ENOENT;
+ 		how = LOOKUP_EMPTY;
+ 	}
+ 
+ 	if (flags & AT_SYMLINK_FOLLOW)
+ 		how |= LOOKUP_FOLLOW;
+ retry:
+ 	error = user_path_at(olddfd, oldname, how, &old_path);
+ 	if (error)
+ 		return error;
+ 
+ 	new_dentry = user_path_create(newdfd, newname, &new_path,
+ 					(how & LOOKUP_REVAL));	/* only the REVAL bit carries over to the new name */
+ 	error = PTR_ERR(new_dentry);
+ 	if (IS_ERR(new_dentry))
+ 		goto out;
+ 
+ 	error = -EXDEV;
+ 	if (old_path.mnt != new_path.mnt)	/* both names must be on the same mount */
+ 		goto out_dput;
+ 	error = may_linkat(&old_path);
+ 	if (unlikely(error))
+ 		goto out_dput;
+ 	error = security_path_link(old_path.dentry, &new_path, new_dentry);
+ 	if (error)
+ 		goto out_dput;
+ 	error = vfs_link(old_path.dentry, new_path.dentry->d_inode, new_dentry, &delegated_inode);
+ out_dput:
+ 	done_path_create(&new_path, new_dentry);
+ 	if (delegated_inode) {
+ 		error = break_deleg_wait(&delegated_inode);
+ 		if (!error) {
+ 			path_put(&old_path);	/* the retry looks oldname up again */
+ 			goto retry;
+ 		}
+ 	}
+ 	if (retry_estale(error, how)) {
+ 		path_put(&old_path);
+ 		how |= LOOKUP_REVAL;
+ 		goto retry;
+ 	}
+ out:
+ 	path_put(&old_path);
+ 
+ 	return error;
+ }
+ 
+ SYSCALL_DEFINE5(linkat, int, olddfd, const char __user *, oldname,
+ 		int, newdfd, const char __user *, newname, int, flags)
+ {	/* linkat entry point: forwards all five arguments unchanged to do_linkat() */
+ 	return do_linkat(olddfd, oldname, newdfd, newname, flags);
+ }
+ 
+ SYSCALL_DEFINE2(link, const char __user *, oldname, const char __user *, newname)
+ {	/* link entry point: do_linkat() with both dfds fixed to AT_FDCWD and flags 0 */
+ 	return do_linkat(AT_FDCWD, oldname, AT_FDCWD, newname, 0);
+ }
+ 
+ /**
+  * vfs_rename - rename a filesystem object
+  * @old_dir:	parent of source
+  * @old_dentry:	source
+  * @new_dir:	parent of destination
+  * @new_dentry:	destination
+  * @delegated_inode: returns an inode needing a delegation break
+  * @flags:	rename flags
+  *
+  * The caller must hold multiple mutexes--see lock_rename().
+  *
+  * If vfs_rename discovers a delegation in need of breaking at either
+  * the source or destination, it will return -EWOULDBLOCK and return a
+  * reference to the inode in delegated_inode.  The caller should then
+  * break the delegation and retry.  Because breaking a delegation may
+  * take a long time, the caller should drop all locks before doing
+  * so.
+  *
+  * Alternatively, a caller may pass NULL for delegated_inode.  This may
+  * be appropriate for callers that expect the underlying filesystem not
+  * to be NFS exported.
+  *
+  * The worst of all namespace operations - renaming directory. "Perverted"
+  * doesn't even start to describe it. Somebody in UCB had a heck of a trip...
+  * Problems:
+  *
+  *	a) we can get into loop creation.
+  *	b) race potential - two innocent renames can create a loop together.
+  *	   That's where 4.4 screws up. Current fix: serialization on
+  *	   sb->s_vfs_rename_mutex. We might be more accurate, but that's another
+  *	   story.
+  *	c) we have to lock _four_ objects - parents and victim (if it exists),
+  *	   and source (if it is not a directory).
+  *	   And that - after we got ->i_mutex on parents (until then we don't know
+  *	   whether the target exists).  Solution: try to be smart with locking
+  *	   order for inodes.  We rely on the fact that tree topology may change
+  *	   only under ->s_vfs_rename_mutex _and_ that parent of the object we
+  *	   move will be locked.  Thus we can rank directories by the tree
+  *	   (ancestors first) and rank all non-directories after them.
+  *	   That works since everybody except rename does "lock parent, lookup,
+  *	   lock child" and rename is under ->s_vfs_rename_mutex.
+  *	   HOWEVER, it relies on the assumption that any object with ->lookup()
+  *	   has no more than 1 dentry.  If "hybrid" objects will ever appear,
+  *	   we'd better make sure that there's no link(2) for them.
+  *	d) conversion from fhandle to dentry may come in the wrong moment - when
+  *	   we are removing the target. Solution: we will have to grab ->i_mutex
+  *	   in the fhandle_to_dentry code. [FIXME - current nfsfh.c relies on
+  *	   ->i_mutex on parents, which works but leads to some truly excessive
+  *	   locking].
+  */
+ int vfs_rename(struct inode *old_dir, struct dentry *old_dentry,
+ 	       struct inode *new_dir, struct dentry *new_dentry,
+ 	       struct inode **delegated_inode, unsigned int flags)
+ {
+ 	int error;
+ 	bool is_dir = d_is_dir(old_dentry);
+ 	struct inode *source = old_dentry->d_inode;
+ 	struct inode *target = new_dentry->d_inode;	/* NULL when the destination does not exist */
+ 	bool new_is_dir = false;
+ 	unsigned max_links = new_dir->i_sb->s_max_links;	/* zero disables the -EMLINK checks below */
+ 	struct name_snapshot old_name;
+ 
+ 	if (source == target)	/* source and destination are the same inode: nothing to do */
+ 		return 0;
+ 
+ 	error = may_delete(old_dir, old_dentry, is_dir);
+ 	if (error)
+ 		return error;
+ 
+ 	if (!target) {
+ 		error = may_create(new_dir, new_dentry);
+ 	} else {
+ 		new_is_dir = d_is_dir(new_dentry);
+ 
+ 		if (!(flags & RENAME_EXCHANGE))
+ 			error = may_delete(new_dir, new_dentry, is_dir);	/* plain rename: target is checked against the source's type */
+ 		else
+ 			error = may_delete(new_dir, new_dentry, new_is_dir);	/* exchange: target is checked against its own type */
+ 	}
+ 	if (error)
+ 		return error;
+ 
+ 	if (!old_dir->i_op->rename)
+ 		return -EPERM;
+ 
+ 	/*
+ 	 * If we are going to change the parent - check write permissions,
+ 	 * we'll need to flip '..'.
+ 	 */
+ 	if (new_dir != old_dir) {
+ 		if (is_dir) {
+ 			error = inode_permission(source, MAY_WRITE);
+ 			if (error)
+ 				return error;
+ 		}
+ 		if ((flags & RENAME_EXCHANGE) && new_is_dir) {
+ 			error = inode_permission(target, MAY_WRITE);
+ 			if (error)
+ 				return error;
+ 		}
+ 	}
+ 
+ 	error = security_inode_rename(old_dir, old_dentry, new_dir, new_dentry,
+ 				      flags);
+ 	if (error)
+ 		return error;
+ 
+ 	take_dentry_name_snapshot(&old_name, old_dentry);	/* old name is kept for fsnotify_move() after the rename */
+ 	dget(new_dentry);
+ 	if (!is_dir || (flags & RENAME_EXCHANGE))
+ 		lock_two_nondirectories(source, target);
+ 	else if (target)
+ 		inode_lock(target);	/* directory rename over an existing target: lock only the target */
+ 
+ 	error = -EBUSY;
+ 	if (is_local_mountpoint(old_dentry) || is_local_mountpoint(new_dentry))
+ 		goto out;
+ 
+ 	if (max_links && new_dir != old_dir) {
+ 		error = -EMLINK;
+ 		if (is_dir && !new_is_dir && new_dir->i_nlink >= max_links)
+ 			goto out;
+ 		if ((flags & RENAME_EXCHANGE) && !is_dir && new_is_dir &&
+ 		    old_dir->i_nlink >= max_links)
+ 			goto out;
+ 	}
+ 	if (!is_dir) {
+ 		error = try_break_deleg(source, delegated_inode);
+ 		if (error)
+ 			goto out;
+ 	}
+ 	if (target && !new_is_dir) {
+ 		error = try_break_deleg(target, delegated_inode);
+ 		if (error)
+ 			goto out;
+ 	}
+ 	error = old_dir->i_op->rename(old_dir, old_dentry,
+ 				       new_dir, new_dentry, flags);
+ 	if (error)
+ 		goto out;
+ 
+ 	if (!(flags & RENAME_EXCHANGE) && target) {	/* an existing target was replaced */
+ 		if (is_dir) {
+ 			shrink_dcache_parent(new_dentry);
+ 			target->i_flags |= S_DEAD;
+ 		}
+ 		dont_mount(new_dentry);
+ 		detach_mounts(new_dentry);
+ 	}
+ 	if (!(old_dir->i_sb->s_type->fs_flags & FS_RENAME_DOES_D_MOVE)) {	/* skipped when the filesystem sets FS_RENAME_DOES_D_MOVE */
+ 		if (!(flags & RENAME_EXCHANGE))
+ 			d_move(old_dentry, new_dentry);
+ 		else
+ 			d_exchange(old_dentry, new_dentry);
+ 	}
+ out:
+ 	if (!is_dir || (flags & RENAME_EXCHANGE))	/* mirrors the locking choice made above */
+ 		unlock_two_nondirectories(source, target);
+ 	else if (target)
+ 		inode_unlock(target);
+ 	dput(new_dentry);	/* pairs with the dget() above */
+ 	if (!error) {
+ 		fsnotify_move(old_dir, new_dir, &old_name.name, is_dir,
+ 			      !(flags & RENAME_EXCHANGE) ? target : NULL, old_dentry);
+ 		if (flags & RENAME_EXCHANGE) {
+ 			fsnotify_move(new_dir, old_dir, &old_dentry->d_name,
+ 				      new_is_dir, NULL, new_dentry);	/* exchange: a second event for the opposite direction */
+ 		}
+ 	}
+ 	release_dentry_name_snapshot(&old_name);
+ 
+ 	return error;
+ }
+ EXPORT_SYMBOL(vfs_rename);
+ 
+ static int do_renameat2(int olddfd, const char __user *oldname, int newdfd,
+ 			const char __user *newname, unsigned int flags)
+ {	/* Shared implementation of rename, renameat and renameat2; returns 0 or -errno. */
+ 	struct dentry *old_dentry, *new_dentry;
+ 	struct dentry *trap;
+ 	struct path old_path, new_path;
+ 	struct qstr old_last, new_last;
+ 	int old_type, new_type;
+ 	struct inode *delegated_inode = NULL;
+ 	struct filename *from;
+ 	struct filename *to;
+ 	unsigned int lookup_flags = 0, target_flags = LOOKUP_RENAME_TARGET;
+ 	bool should_retry = false;
+ 	int error;
+ 
+ 	if (flags & ~(RENAME_NOREPLACE | RENAME_EXCHANGE | RENAME_WHITEOUT))
+ 		return -EINVAL;
+ 
+ 	if ((flags & (RENAME_NOREPLACE | RENAME_WHITEOUT)) &&
+ 	    (flags & RENAME_EXCHANGE))	/* EXCHANGE cannot be combined with NOREPLACE or WHITEOUT */
+ 		return -EINVAL;
+ 
+ 	if ((flags & RENAME_WHITEOUT) && !capable(CAP_MKNOD))
+ 		return -EPERM;
+ 
+ 	if (flags & RENAME_EXCHANGE)
+ 		target_flags = 0;	/* LOOKUP_RENAME_TARGET is not used for an exchange */
+ 
+ retry:
+ 	from = filename_parentat(olddfd, getname(oldname), lookup_flags,
+ 				&old_path, &old_last, &old_type);
+ 	if (IS_ERR(from)) {
+ 		error = PTR_ERR(from);
+ 		goto exit;
+ 	}
+ 
+ 	to = filename_parentat(newdfd, getname(newname), lookup_flags,
+ 				&new_path, &new_last, &new_type);
+ 	if (IS_ERR(to)) {
+ 		error = PTR_ERR(to);
+ 		goto exit1;
+ 	}
+ 
+ 	error = -EXDEV;
+ 	if (old_path.mnt != new_path.mnt)	/* both parents must be on the same mount */
+ 		goto exit2;
+ 
+ 	error = -EBUSY;
+ 	if (old_type != LAST_NORM)
+ 		goto exit2;
+ 
+ 	if (flags & RENAME_NOREPLACE)
+ 		error = -EEXIST;
+ 	if (new_type != LAST_NORM)	/* fails with -EEXIST under NOREPLACE, else the -EBUSY set above */
+ 		goto exit2;
+ 
+ 	error = mnt_want_write(old_path.mnt);
+ 	if (error)
+ 		goto exit2;
+ 
+ retry_deleg:
+ 	trap = lock_rename(new_path.dentry, old_path.dentry);	/* 'trap' is compared against both dentries below */
+ 
+ 	old_dentry = __lookup_hash(&old_last, old_path.dentry, lookup_flags);
+ 	error = PTR_ERR(old_dentry);
+ 	if (IS_ERR(old_dentry))
+ 		goto exit3;
+ 	/* source must exist */
+ 	error = -ENOENT;
+ 	if (d_is_negative(old_dentry))
+ 		goto exit4;
+ 	new_dentry = __lookup_hash(&new_last, new_path.dentry, lookup_flags | target_flags);
+ 	error = PTR_ERR(new_dentry);
+ 	if (IS_ERR(new_dentry))
+ 		goto exit4;
+ 	error = -EEXIST;
+ 	if ((flags & RENAME_NOREPLACE) && d_is_positive(new_dentry))
+ 		goto exit5;
+ 	if (flags & RENAME_EXCHANGE) {
+ 		error = -ENOENT;
+ 		if (d_is_negative(new_dentry))	/* an exchange needs an existing target */
+ 			goto exit5;
+ 
+ 		if (!d_is_dir(new_dentry)) {
+ 			error = -ENOTDIR;
+ 			if (new_last.name[new_last.len])
+ 				goto exit5;
+ 		}
+ 	}
+ 	/* unless the source is a directory trailing slashes give -ENOTDIR */
+ 	if (!d_is_dir(old_dentry)) {
+ 		error = -ENOTDIR;
+ 		if (old_last.name[old_last.len])
+ 			goto exit5;
+ 		if (!(flags & RENAME_EXCHANGE) && new_last.name[new_last.len])
+ 			goto exit5;
+ 	}
+ 	/* source should not be ancestor of target */
+ 	error = -EINVAL;
+ 	if (old_dentry == trap)
+ 		goto exit5;
+ 	/* target should not be an ancestor of source */
+ 	if (!(flags & RENAME_EXCHANGE))
+ 		error = -ENOTEMPTY;	/* an exchange keeps the -EINVAL set above */
+ 	if (new_dentry == trap)
+ 		goto exit5;
+ 
+ 	error = security_path_rename(&old_path, old_dentry,
+ 				     &new_path, new_dentry, flags);
+ 	if (error)
+ 		goto exit5;
+ 	error = vfs_rename(old_path.dentry->d_inode, old_dentry,
+ 			   new_path.dentry->d_inode, new_dentry,
+ 			   &delegated_inode, flags);
+ exit5:
+ 	dput(new_dentry);
+ exit4:
+ 	dput(old_dentry);
+ exit3:
+ 	unlock_rename(new_path.dentry, old_path.dentry);
+ 	if (delegated_inode) {
+ 		error = break_deleg_wait(&delegated_inode);
+ 		if (!error)
+ 			goto retry_deleg;	/* delegation broken: retry with both parent paths still held */
+ 	}
+ 	mnt_drop_write(old_path.mnt);	/* pairs with mnt_want_write() above */
+ exit2:
+ 	if (retry_estale(error, lookup_flags))
+ 		should_retry = true;	/* decided here, acted on after paths and names are released */
+ 	path_put(&new_path);
+ 	putname(to);
+ exit1:
+ 	path_put(&old_path);
+ 	putname(from);
+ 	if (should_retry) {
+ 		should_retry = false;
+ 		lookup_flags |= LOOKUP_REVAL;
+ 		goto retry;
+ 	}
+ exit:
+ 	return error;
+ }
+ 
+ SYSCALL_DEFINE5(renameat2, int, olddfd, const char __user *, oldname,
+ 		int, newdfd, const char __user *, newname, unsigned int, flags)
+ {	/* renameat2 entry point: forwards all five arguments unchanged to do_renameat2() */
+ 	return do_renameat2(olddfd, oldname, newdfd, newname, flags);
+ }
+ 
+ SYSCALL_DEFINE4(renameat, int, olddfd, const char __user *, oldname,
+ 		int, newdfd, const char __user *, newname)
+ {	/* renameat entry point: do_renameat2() with flags fixed to 0 */
+ 	return do_renameat2(olddfd, oldname, newdfd, newname, 0);
+ }
+ 
+ SYSCALL_DEFINE2(rename, const char __user *, oldname, const char __user *, newname)
+ {	/* rename entry point: do_renameat2() with both dfds fixed to AT_FDCWD and flags 0 */
+ 	return do_renameat2(AT_FDCWD, oldname, AT_FDCWD, newname, 0);
+ }
+ 
+ int vfs_whiteout(struct inode *dir, struct dentry *dentry)
+ {
+ 	int rc;
+ 
+ 	/* A whiteout is made through ->mknod() as an S_IFCHR node. */
+ 	rc = may_create(dir, dentry);
+ 	if (!rc && !dir->i_op->mknod)
+ 		rc = -EPERM;
+ 	if (rc)
+ 		return rc;
+ 	return dir->i_op->mknod(dir, dentry, S_IFCHR | WHITEOUT_MODE, WHITEOUT_DEV);
+ }
+ EXPORT_SYMBOL(vfs_whiteout);
+ 
+ int readlink_copy(char __user *buffer, int buflen, const char *link)
+ {
+ 	int copied;
+ 
+ 	/* An ERR_PTR link carries the errno to hand straight back. */
+ 	if (IS_ERR(link))
+ 		return PTR_ERR(link);
+ 
+ 	/* Clamp to the user buffer; no NUL terminator is copied. */
+ 	copied = strlen(link);
+ 	if (copied > (unsigned) buflen)
+ 		copied = buflen;
+ 	return copy_to_user(buffer, link, copied) ? -EFAULT : copied;
+ }
+ 
+ /**
+  * vfs_readlink - copy symlink body into userspace buffer
+  * @dentry: dentry on which to get symbolic link
+  * @buffer: user memory pointer
+  * @buflen: size of buffer
+  *
+  * Does not touch atime.  That's up to the caller if necessary
+  *
+  * Does not call security hook.
+  */
+ int vfs_readlink(struct dentry *dentry, char __user *buffer, int buflen)
+ {
+ 	struct inode *inode = d_inode(dentry);
+ 	DEFINE_DELAYED_CALL(done);
+ 	const char *link;
+ 	int res;
+ 
+ 	if (unlikely(!(inode->i_opflags & IOP_DEFAULT_READLINK))) {	/* flag not set yet on this inode */
+ 		if (unlikely(inode->i_op->readlink))
+ 			return inode->i_op->readlink(dentry, buffer, buflen);	/* filesystem supplies its own readlink */
+ 
+ 		if (!d_is_symlink(dentry))
+ 			return -EINVAL;
+ 
+ 		spin_lock(&inode->i_lock);
+ 		inode->i_opflags |= IOP_DEFAULT_READLINK;	/* later calls skip this whole block */
+ 		spin_unlock(&inode->i_lock);
+ 	}
+ 
+ 	link = READ_ONCE(inode->i_link);
+ 	if (!link) {	/* no body stored in i_link: ask the filesystem */
+ 		link = inode->i_op->get_link(dentry, inode, &done);
+ 		if (IS_ERR(link))
+ 			return PTR_ERR(link);
+ 	}
+ 	res = readlink_copy(buffer, buflen, link);
+ 	do_delayed_call(&done);	/* runs whatever cleanup ->get_link() registered in 'done' */
+ 	return res;
+ }
+ EXPORT_SYMBOL(vfs_readlink);
+ 
+ /**
+  * vfs_get_link - get symlink body
+  * @dentry: dentry on which to get symbolic link
+  * @done: caller needs to free returned data with this
+  *
+  * Calls security hook and i_op->get_link() on the supplied inode.
+  *
+  * It does not touch atime.  That's up to the caller if necessary.
+  *
+  * Does not work on "special" symlinks like /proc/$$/fd/N
+  */
+ const char *vfs_get_link(struct dentry *dentry, struct delayed_call *done)
+ {
+ 	struct inode *inode = d_inode(dentry);
+ 	int denied;
+ 
+ 	if (!d_is_symlink(dentry))
+ 		return ERR_PTR(-EINVAL);
+ 	denied = security_inode_readlink(dentry);
+ 	if (denied)
+ 		return ERR_PTR(denied);
+ 	return inode->i_op->get_link(dentry, inode, done);
+ }
+ EXPORT_SYMBOL(vfs_get_link);
+ 
+ /* get the link contents into pagecache */
+ const char *page_get_link(struct dentry *dentry, struct inode *inode,
+ 			  struct delayed_call *callback)
+ {	/* Returns the link body from page 0 of the inode's mapping, or an ERR_PTR. */
+ 	char *kaddr;
+ 	struct page *page;
+ 	struct address_space *mapping = inode->i_mapping;
+ 
+ 	if (!dentry) {	/* no dentry: use the page only if it is already cached and up to date */
+ 		page = find_get_page(mapping, 0);
+ 		if (!page)
+ 			return ERR_PTR(-ECHILD);
+ 		if (!PageUptodate(page)) {
+ 			put_page(page);
+ 			return ERR_PTR(-ECHILD);
+ 		}
+ 	} else {
+ 		page = read_mapping_page(mapping, 0, NULL);
+ 		if (IS_ERR(page))
+ 			return (char*)page;	/* the ERR_PTR value is passed through unchanged */
+ 	}
+ 	set_delayed_call(callback, page_put_link, page);	/* caller's delayed call will put_page() this page */
+ 	BUG_ON(mapping_gfp_mask(mapping) & __GFP_HIGHMEM);
+ 	kaddr = page_address(page);
+ 	nd_terminate_link(kaddr, inode->i_size, PAGE_SIZE - 1);
+ 	return kaddr;
+ }
+ 
+ EXPORT_SYMBOL(page_get_link);
+ 
+ void page_put_link(void *arg)
+ {	/* Delayed-call callback registered by page_get_link(); arg is the page it obtained. */
+ 	put_page(arg);
+ }
+ EXPORT_SYMBOL(page_put_link);
+ 
+ int page_readlink(struct dentry *dentry, char __user *buffer, int buflen)
+ {
+ 	DEFINE_DELAYED_CALL(cleanup);
+ 	const char *body = page_get_link(dentry, d_inode(dentry), &cleanup);
+ 	/* readlink_copy() itself turns an ERR_PTR body into its errno. */
+ 	int copied = readlink_copy(buffer, buflen, body);
+ 	do_delayed_call(&cleanup);	/* runs page_put_link() if page_get_link() registered it */
+ 	return copied;
+ }
+ EXPORT_SYMBOL(page_readlink);
+ 
+ /*
+  * The nofs argument instructs pagecache_write_begin to pass AOP_FLAG_NOFS
+  */
+ int __page_symlink(struct inode *inode, const char *symname, int len, int nofs)
+ {	/* Writes len-1 bytes of symname at offset 0 of the inode's mapping; returns 0 or -errno. */
+ 	struct address_space *mapping = inode->i_mapping;
+ 	struct page *page;
+ 	void *fsdata;
+ 	int err;
+ 	unsigned int flags = 0;
+ 	if (nofs)
+ 		flags |= AOP_FLAG_NOFS;
+ 
+ retry:
+ 	err = pagecache_write_begin(NULL, mapping, 0, len-1,
+ 				flags, &page, &fsdata);
+ 	if (err)
+ 		goto fail;
+ 
+ 	memcpy(page_address(page), symname, len-1);	/* NOTE(review): len presumably counts a trailing NUL that is not written - confirm */
+ 
+ 	err = pagecache_write_end(NULL, mapping, 0, len-1, len-1,
+ 							page, fsdata);
+ 	if (err < 0)
+ 		goto fail;
+ 	if (err < len-1)	/* fewer than len-1 bytes reported: start the write over */
+ 		goto retry;
+ 
+ 	mark_inode_dirty(inode);
+ 	return 0;
+ fail:
+ 	return err;
+ }
+ EXPORT_SYMBOL(__page_symlink);
+ 
+ int page_symlink(struct inode *inode, const char *symname, int len)
+ {
+ 	int nofs = !mapping_gfp_constraint(inode->i_mapping, __GFP_FS);	/* nonzero when the constraint yields 0 for __GFP_FS */
+ 	return __page_symlink(inode, symname, len, nofs);
+ }
+ EXPORT_SYMBOL(page_symlink);
+ 
+ const struct inode_operations page_symlink_inode_operations = {	/* inode ops that set only ->get_link */
+ 	.get_link	= page_get_link,
+ };
+ EXPORT_SYMBOL(page_symlink_inode_operations);
diff --color -rcNP Master/fs/namei.c.rej OG/fs/namei.c.rej
*** Master/fs/namei.c.rej	1969-12-31 19:00:00.000000000 -0500
--- OG/fs/namei.c.rej	2021-04-20 15:11:27.316000000 -0400
***************
*** 0 ****
--- 1,21 ----
+ *** fs/namei.c	2021-03-13 14:14:32.000000000 +0200
+ --- fs/namei.c	2021-03-13 19:47:09.000000000 +0200
+ ***************
+ *** 52,59 ****
+    * The new code replaces the old recursive symlink resolution with
+    * an iterative one (in case of non-nested symlink chains).  It does
+    * this with calls to <fs>_follow_link().
+ !  * As a side effect, dir_namei(), _namei() and follow_link() are now
+ !  * replaced with a single function lookup_dentry() that can handle all
+    * the special cases of the former code.
+    *
+    * With the new dcache, the pathname is stored at each inode, at least as
+ --- 51,58 ----
+    * The new code replaces the old recursive symlink resolution with
+    * an iterative one (in case of non-nested symlink chains).  It does
+    * this with calls to <fs>_follow_link().
+ !  * As a side effect, dir_namei(), _namei() and follow_link() are now
+ !  * replaced with a single function lookup_dentry() that can handle all
+    * the special cases of the former code.
+    *
+    * With the new dcache, the pathname is stored at each inode, at least as
diff --color -rcNP Master/fs/namespace.c OG/fs/namespace.c
*** Master/fs/namespace.c	2021-04-20 14:17:31.000000000 -0400
--- OG/fs/namespace.c	2021-04-20 15:11:34.510000000 -0400
***************
*** 30,35 ****
--- 30,36 ----
  #include <uapi/linux/mount.h>
  #include <linux/fs_context.h>
  #include <linux/shmem_fs.h>
+ #include <linux/minisec.h>
  
  #include "pnode.h"
  #include "internal.h"
***************
*** 3130,3135 ****
--- 3131,3140 ----
  			    SB_LAZYTIME |
  			    SB_I_VERSION);
  
+ 	if (gr_handle_chroot_mount(path.dentry, path.mnt, dev_name)) {	/* 'path' is a struct here: callers below pass &path */
+ 		retval = -EPERM;
+ 		goto dput_out;	/* leave through the function's path cleanup instead of a bare return */
+ 	}
  	if ((flags & (MS_REMOUNT | MS_BIND)) == (MS_REMOUNT | MS_BIND))
  		retval = do_reconfigure_mnt(&path, mnt_flags);
  	else if (flags & MS_REMOUNT)
***************
*** 3641,3646 ****
--- 3646,3655 ----
  	if (error)
  		goto out2;
  
+ 	if (gr_handle_chroot_pivot()) {
+ 		error = -EPERM;
+ 		goto out2;
+ 	}
  	get_fs_root(current->fs, &root);
  	old_mp = lock_mount(&old);
  	error = PTR_ERR(old_mp);
diff --color -rcNP Master/fs/open.c OG/fs/open.c
*** Master/fs/open.c	2021-04-20 14:17:31.000000000 -0400
--- OG/fs/open.c	2021-04-20 15:11:34.511000000 -0400
***************
*** 32,37 ****
--- 32,38 ----
  #include <linux/ima.h>
  #include <linux/dnotify.h>
  #include <linux/compat.h>
+ #include <linux/minisec.h>
  
  #include "internal.h"
  
***************
*** 524,530 ****
--- 525,536 ----
  	if (error)
  		goto dput_and_out;
  
+ 	if (gr_handle_chroot_chroot(path.dentry, path.mnt)) {
+ 		error = -EPERM;	/* error is 0 here; without this a vetoed chroot would report success */
+ 		goto dput_and_out;
+ 	}
  	set_fs_root(current->fs, &path);
+ 	gr_handle_chroot_chdir(&path);	/* runs only after the root was actually changed */
  	error = 0;
  dput_and_out:
  	path_put(&path);
***************
*** 553,558 ****
--- 559,570 ----
  		return error;
  retry_deleg:
  	inode_lock(inode);
+ 
+ 	if (gr_handle_chroot_chmod(path->dentry, path->mnt, mode)) {
+ 		error = -EACCES;
+ 		goto out_unlock;
+ 	}
+ 
  	error = security_path_chmod(path, mode);
  	if (error)
  		goto out_unlock;
***************
*** 1056,1062 ****
  {
  	struct filename *name = getname_kernel(filename);
  	struct file *file = ERR_CAST(name);
! 	
  	if (!IS_ERR(name)) {
  		file = file_open_name(name, flags, mode);
  		putname(name);
--- 1068,1074 ----
  {
  	struct filename *name = getname_kernel(filename);
  	struct file *file = ERR_CAST(name);
! 
  	if (!IS_ERR(name)) {
  		file = file_open_name(name, flags, mode);
  		putname(name);
diff --color -rcNP Master/fs/open.c.orig OG/fs/open.c.orig
*** Master/fs/open.c.orig	1969-12-31 19:00:00.000000000 -0500
--- OG/fs/open.c.orig	2021-04-20 15:10:45.381000000 -0400
***************
*** 0 ****
--- 1,1273 ----
+ // SPDX-License-Identifier: GPL-2.0-only
+ /*
+  *  linux/fs/open.c
+  *
+  *  Copyright (C) 1991, 1992  Linus Torvalds
+  */
+ 
+ #include <linux/string.h>
+ #include <linux/mm.h>
+ #include <linux/file.h>
+ #include <linux/fdtable.h>
+ #include <linux/fsnotify.h>
+ #include <linux/module.h>
+ #include <linux/tty.h>
+ #include <linux/namei.h>
+ #include <linux/backing-dev.h>
+ #include <linux/capability.h>
+ #include <linux/securebits.h>
+ #include <linux/security.h>
+ #include <linux/mount.h>
+ #include <linux/fcntl.h>
+ #include <linux/slab.h>
+ #include <linux/uaccess.h>
+ #include <linux/fs.h>
+ #include <linux/personality.h>
+ #include <linux/pagemap.h>
+ #include <linux/syscalls.h>
+ #include <linux/rcupdate.h>
+ #include <linux/audit.h>
+ #include <linux/falloc.h>
+ #include <linux/fs_struct.h>
+ #include <linux/ima.h>
+ #include <linux/dnotify.h>
+ #include <linux/compat.h>
+ #include <linux/minisec.h>
+ 
+ #include "internal.h"
+ 
+ int do_truncate(struct dentry *dentry, loff_t length, unsigned int time_attrs,
+ 	struct file *filp)
+ {
+ 	int ret;
+ 	struct iattr newattrs;
+ 
+ 	/* Not pretty: "inode->i_size" shouldn't really be signed. But it is. */
+ 	if (length < 0)
+ 		return -EINVAL;
+ 
+ 	newattrs.ia_size = length;
+ 	newattrs.ia_valid = ATTR_SIZE | time_attrs;
+ 	if (filp) {
+ 		newattrs.ia_file = filp;
+ 		newattrs.ia_valid |= ATTR_FILE;
+ 	}
+ 
+ 	/* Remove suid, sgid, and file capabilities on truncate too */
+ 	ret = dentry_needs_remove_privs(dentry);
+ 	if (ret < 0)
+ 		return ret;
+ 	if (ret)
+ 		newattrs.ia_valid |= ret | ATTR_FORCE;
+ 
+ 	inode_lock(dentry->d_inode);
+ 	/* Note any delegations or leases have already been broken: */
+ 	ret = notify_change(dentry, &newattrs, NULL);
+ 	inode_unlock(dentry->d_inode);
+ 	return ret;
+ }
+ 
+ long vfs_truncate(const struct path *path, loff_t length)
+ {
+ 	struct inode *inode;
+ 	long error;
+ 
+ 	inode = path->dentry->d_inode;
+ 
+ 	/* For directories it's -EISDIR, for other non-regulars - -EINVAL */
+ 	if (S_ISDIR(inode->i_mode))
+ 		return -EISDIR;
+ 	if (!S_ISREG(inode->i_mode))
+ 		return -EINVAL;
+ 
+ 	error = mnt_want_write(path->mnt);
+ 	if (error)
+ 		goto out;
+ 
+ 	error = inode_permission(inode, MAY_WRITE);
+ 	if (error)
+ 		goto mnt_drop_write_and_out;
+ 
+ 	error = -EPERM;
+ 	if (IS_APPEND(inode))
+ 		goto mnt_drop_write_and_out;
+ 
+ 	error = get_write_access(inode);
+ 	if (error)
+ 		goto mnt_drop_write_and_out;
+ 
+ 	/*
+ 	 * Make sure that there are no leases.  get_write_access() protects
+ 	 * against the truncate racing with a lease-granting setlease().
+ 	 */
+ 	error = break_lease(inode, O_WRONLY);
+ 	if (error)
+ 		goto put_write_and_out;
+ 
+ 	error = locks_verify_truncate(inode, NULL, length);
+ 	if (!error)
+ 		error = security_path_truncate(path);
+ 	if (!error)
+ 		error = do_truncate(path->dentry, length, 0, NULL);
+ 
+ put_write_and_out:
+ 	put_write_access(inode);
+ mnt_drop_write_and_out:
+ 	mnt_drop_write(path->mnt);
+ out:
+ 	return error;
+ }
+ EXPORT_SYMBOL_GPL(vfs_truncate);
+ 
+ long do_sys_truncate(const char __user *pathname, loff_t length)
+ {
+ 	unsigned int lookup_flags = LOOKUP_FOLLOW;
+ 	struct path path;
+ 	int error;
+ 
+ 	if (length < 0)	/* sorry, but loff_t says... */
+ 		return -EINVAL;
+ 
+ retry:
+ 	error = user_path_at(AT_FDCWD, pathname, lookup_flags, &path);
+ 	if (!error) {
+ 		error = vfs_truncate(&path, length);
+ 		path_put(&path);
+ 	}
+ 	if (retry_estale(error, lookup_flags)) {
+ 		lookup_flags |= LOOKUP_REVAL;
+ 		goto retry;
+ 	}
+ 	return error;
+ }
+ 
+ SYSCALL_DEFINE2(truncate, const char __user *, path, long, length)
+ {
+ 	return do_sys_truncate(path, length);
+ }
+ 
+ #ifdef CONFIG_COMPAT
+ COMPAT_SYSCALL_DEFINE2(truncate, const char __user *, path, compat_off_t, length)
+ {
+ 	return do_sys_truncate(path, length);
+ }
+ #endif
+ 
+ long do_sys_ftruncate(unsigned int fd, loff_t length, int small)
+ {
+ 	struct inode *inode;
+ 	struct dentry *dentry;
+ 	struct fd f;
+ 	int error;
+ 
+ 	error = -EINVAL;
+ 	if (length < 0)
+ 		goto out;
+ 	error = -EBADF;
+ 	f = fdget(fd);
+ 	if (!f.file)
+ 		goto out;
+ 
+ 	/* explicitly opened as large or we are on 64-bit box */
+ 	if (f.file->f_flags & O_LARGEFILE)
+ 		small = 0;
+ 
+ 	dentry = f.file->f_path.dentry;
+ 	inode = dentry->d_inode;
+ 	error = -EINVAL;
+ 	if (!S_ISREG(inode->i_mode) || !(f.file->f_mode & FMODE_WRITE))
+ 		goto out_putf;
+ 
+ 	error = -EINVAL;
+ 	/* Cannot ftruncate over 2^31 bytes without large file support */
+ 	if (small && length > MAX_NON_LFS)
+ 		goto out_putf;
+ 
+ 	error = -EPERM;
+ 	/* Check IS_APPEND on real upper inode */
+ 	if (IS_APPEND(file_inode(f.file)))
+ 		goto out_putf;
+ 
+ 	sb_start_write(inode->i_sb);
+ 	error = locks_verify_truncate(inode, f.file, length);
+ 	if (!error)
+ 		error = security_path_truncate(&f.file->f_path);
+ 	if (!error)
+ 		error = do_truncate(dentry, length, ATTR_MTIME|ATTR_CTIME, f.file);
+ 	sb_end_write(inode->i_sb);
+ out_putf:
+ 	fdput(f);
+ out:
+ 	return error;
+ }
+ 
+ SYSCALL_DEFINE2(ftruncate, unsigned int, fd, unsigned long, length)
+ {
+ 	return do_sys_ftruncate(fd, length, 1);
+ }
+ 
+ #ifdef CONFIG_COMPAT
+ COMPAT_SYSCALL_DEFINE2(ftruncate, unsigned int, fd, compat_ulong_t, length)
+ {
+ 	return do_sys_ftruncate(fd, length, 1);
+ }
+ #endif
+ 
+ /* LFS versions of truncate are only needed on 32 bit machines */
+ #if BITS_PER_LONG == 32
+ SYSCALL_DEFINE2(truncate64, const char __user *, path, loff_t, length)
+ {
+ 	return do_sys_truncate(path, length);
+ }
+ 
+ SYSCALL_DEFINE2(ftruncate64, unsigned int, fd, loff_t, length)
+ {
+ 	return do_sys_ftruncate(fd, length, 0);
+ }
+ #endif /* BITS_PER_LONG == 32 */
+ 
+ 
+ int vfs_fallocate(struct file *file, int mode, loff_t offset, loff_t len)
+ {
+ 	struct inode *inode = file_inode(file);
+ 	long ret;
+ 
+ 	if (offset < 0 || len <= 0)
+ 		return -EINVAL;
+ 
+ 	/* Return error if mode is not supported */
+ 	if (mode & ~FALLOC_FL_SUPPORTED_MASK)
+ 		return -EOPNOTSUPP;
+ 
+ 	/* Punch hole and zero range are mutually exclusive */
+ 	if ((mode & (FALLOC_FL_PUNCH_HOLE | FALLOC_FL_ZERO_RANGE)) ==
+ 	    (FALLOC_FL_PUNCH_HOLE | FALLOC_FL_ZERO_RANGE))
+ 		return -EOPNOTSUPP;
+ 
+ 	/* Punch hole must have keep size set */
+ 	if ((mode & FALLOC_FL_PUNCH_HOLE) &&
+ 	    !(mode & FALLOC_FL_KEEP_SIZE))
+ 		return -EOPNOTSUPP;
+ 
+ 	/* Collapse range should only be used exclusively. */
+ 	if ((mode & FALLOC_FL_COLLAPSE_RANGE) &&
+ 	    (mode & ~FALLOC_FL_COLLAPSE_RANGE))
+ 		return -EINVAL;
+ 
+ 	/* Insert range should only be used exclusively. */
+ 	if ((mode & FALLOC_FL_INSERT_RANGE) &&
+ 	    (mode & ~FALLOC_FL_INSERT_RANGE))
+ 		return -EINVAL;
+ 
+ 	/* Unshare range should only be used with allocate mode. */
+ 	if ((mode & FALLOC_FL_UNSHARE_RANGE) &&
+ 	    (mode & ~(FALLOC_FL_UNSHARE_RANGE | FALLOC_FL_KEEP_SIZE)))
+ 		return -EINVAL;
+ 
+ 	if (!(file->f_mode & FMODE_WRITE))
+ 		return -EBADF;
+ 
+ 	/*
+ 	 * We can only allow pure fallocate on append only files
+ 	 */
+ 	if ((mode & ~FALLOC_FL_KEEP_SIZE) && IS_APPEND(inode))
+ 		return -EPERM;
+ 
+ 	if (IS_IMMUTABLE(inode))
+ 		return -EPERM;
+ 
+ 	/*
+ 	 * We cannot allow any fallocate operation on an active swapfile
+ 	 */
+ 	if (IS_SWAPFILE(inode))
+ 		return -ETXTBSY;
+ 
+ 	/*
+ 	 * Revalidate the write permissions, in case security policy has
+ 	 * changed since the files were opened.
+ 	 */
+ 	ret = security_file_permission(file, MAY_WRITE);
+ 	if (ret)
+ 		return ret;
+ 
+ 	if (S_ISFIFO(inode->i_mode))
+ 		return -ESPIPE;
+ 
+ 	if (S_ISDIR(inode->i_mode))
+ 		return -EISDIR;
+ 
+ 	if (!S_ISREG(inode->i_mode) && !S_ISBLK(inode->i_mode))
+ 		return -ENODEV;
+ 
+ 	/* Check for wrap through zero too */
+ 	if (((offset + len) > inode->i_sb->s_maxbytes) || ((offset + len) < 0))
+ 		return -EFBIG;
+ 
+ 	if (!file->f_op->fallocate)
+ 		return -EOPNOTSUPP;
+ 
+ 	file_start_write(file);
+ 	ret = file->f_op->fallocate(file, mode, offset, len);
+ 
+ 	/*
+ 	 * Create inotify and fanotify events.
+ 	 *
+ 	 * To keep the logic simple always create events if fallocate succeeds.
+ 	 * This implies that events are even created if the file size remains
+ 	 * unchanged, e.g. when using flag FALLOC_FL_KEEP_SIZE.
+ 	 */
+ 	if (ret == 0)
+ 		fsnotify_modify(file);
+ 
+ 	file_end_write(file);
+ 	return ret;
+ }
+ EXPORT_SYMBOL_GPL(vfs_fallocate);
+ 
+ int ksys_fallocate(int fd, int mode, loff_t offset, loff_t len)
+ {
+ 	struct fd f = fdget(fd);
+ 	int error = -EBADF;
+ 
+ 	if (f.file) {
+ 		error = vfs_fallocate(f.file, mode, offset, len);
+ 		fdput(f);
+ 	}
+ 	return error;
+ }
+ 
+ SYSCALL_DEFINE4(fallocate, int, fd, int, mode, loff_t, offset, loff_t, len)
+ {
+ 	return ksys_fallocate(fd, mode, offset, len);
+ }
+ 
+ /*
+  * access() needs to use the real uid/gid, not the effective uid/gid.
+  * We do this by temporarily clearing all FS-related capabilities and
+  * switching the fsuid/fsgid around to the real ones.
+  */
+ long do_faccessat(int dfd, const char __user *filename, int mode)
+ {
+ 	const struct cred *old_cred;
+ 	struct cred *override_cred;
+ 	struct path path;
+ 	struct inode *inode;
+ 	int res;
+ 	unsigned int lookup_flags = LOOKUP_FOLLOW;
+ 
+ 	if (mode & ~S_IRWXO)	/* where's F_OK, X_OK, W_OK, R_OK? */
+ 		return -EINVAL;
+ 
+ 	override_cred = prepare_creds();
+ 	if (!override_cred)
+ 		return -ENOMEM;
+ 
+ 	override_cred->fsuid = override_cred->uid;
+ 	override_cred->fsgid = override_cred->gid;
+ 
+ 	if (!issecure(SECURE_NO_SETUID_FIXUP)) {
+ 		/* Clear the capabilities if we switch to a non-root user */
+ 		kuid_t root_uid = make_kuid(override_cred->user_ns, 0);
+ 		if (!uid_eq(override_cred->uid, root_uid))
+ 			cap_clear(override_cred->cap_effective);
+ 		else
+ 			override_cred->cap_effective =
+ 				override_cred->cap_permitted;
+ 	}
+ 
+ 	/*
+ 	 * The new set of credentials can *only* be used in
+ 	 * task-synchronous circumstances, and does not need
+ 	 * RCU freeing, unless somebody then takes a separate
+ 	 * reference to it.
+ 	 *
+ 	 * NOTE! This is _only_ true because this credential
+ 	 * is used purely for override_creds() that installs
+ 	 * it as the subjective cred. Other threads will be
+ 	 * accessing ->real_cred, not the subjective cred.
+ 	 *
+ 	 * If somebody _does_ make a copy of this (using the
+ 	 * 'get_current_cred()' function), that will clear the
+ 	 * non_rcu field, because now that other user may be
+ 	 * expecting RCU freeing. But normal thread-synchronous
+ 	 * cred accesses will keep things non-RCU.
+ 	 */
+ 	override_cred->non_rcu = 1;
+ 
+ 	old_cred = override_creds(override_cred);
+ retry:
+ 	res = user_path_at(dfd, filename, lookup_flags, &path);
+ 	if (res)
+ 		goto out;
+ 
+ 	inode = d_backing_inode(path.dentry);
+ 
+ 	if ((mode & MAY_EXEC) && S_ISREG(inode->i_mode)) {
+ 		/*
+ 		 * MAY_EXEC on regular files is denied if the fs is mounted
+ 		 * with the "noexec" flag.
+ 		 */
+ 		res = -EACCES;
+ 		if (path_noexec(&path))
+ 			goto out_path_release;
+ 	}
+ 
+ 	res = inode_permission(inode, mode | MAY_ACCESS);
+ 	/* SuS v2 requires we report a read only fs too */
+ 	if (res || !(mode & S_IWOTH) || special_file(inode->i_mode))
+ 		goto out_path_release;
+ 	/*
+ 	 * This is a rare case where using __mnt_is_readonly()
+ 	 * is OK without a mnt_want/drop_write() pair.  Since
+ 	 * no actual write to the fs is performed here, we do
+ 	 * not need to telegraph that to anyone.
+ 	 *
+ 	 * By doing this, we accept that this access is
+ 	 * inherently racy and know that the fs may change
+ 	 * state before we even see this result.
+ 	 */
+ 	if (__mnt_is_readonly(path.mnt))
+ 		res = -EROFS;
+ 
+ out_path_release:
+ 	path_put(&path);
+ 	if (retry_estale(res, lookup_flags)) {
+ 		lookup_flags |= LOOKUP_REVAL;
+ 		goto retry;
+ 	}
+ out:
+ 	revert_creds(old_cred);
+ 	put_cred(override_cred);
+ 	return res;
+ }
+ 
+ SYSCALL_DEFINE3(faccessat, int, dfd, const char __user *, filename, int, mode)
+ {
+ 	return do_faccessat(dfd, filename, mode);
+ }
+ 
+ SYSCALL_DEFINE2(access, const char __user *, filename, int, mode)
+ {
+ 	return do_faccessat(AT_FDCWD, filename, mode);
+ }
+ 
+ int ksys_chdir(const char __user *filename)
+ {
+ 	struct path path;
+ 	int error;
+ 	unsigned int lookup_flags = LOOKUP_FOLLOW | LOOKUP_DIRECTORY;
+ retry:
+ 	error = user_path_at(AT_FDCWD, filename, lookup_flags, &path);
+ 	if (error)
+ 		goto out;
+ 
+ 	error = inode_permission(path.dentry->d_inode, MAY_EXEC | MAY_CHDIR);
+ 	if (error)
+ 		goto dput_and_out;
+ 
+ 	set_fs_pwd(current->fs, &path);
+ 
+ dput_and_out:
+ 	path_put(&path);
+ 	if (retry_estale(error, lookup_flags)) {
+ 		lookup_flags |= LOOKUP_REVAL;
+ 		goto retry;
+ 	}
+ out:
+ 	return error;
+ }
+ 
+ SYSCALL_DEFINE1(chdir, const char __user *, filename)
+ {
+ 	return ksys_chdir(filename);
+ }
+ 
+ SYSCALL_DEFINE1(fchdir, unsigned int, fd)
+ {
+ 	struct fd f = fdget_raw(fd);
+ 	int error;
+ 
+ 	error = -EBADF;
+ 	if (!f.file)
+ 		goto out;
+ 
+ 	error = -ENOTDIR;
+ 	if (!d_can_lookup(f.file->f_path.dentry))
+ 		goto out_putf;
+ 
+ 	error = inode_permission(file_inode(f.file), MAY_EXEC | MAY_CHDIR);
+ 	if (!error)
+ 		set_fs_pwd(current->fs, &f.file->f_path);
+ out_putf:
+ 	fdput(f);
+ out:
+ 	return error;
+ }
+ 
+ int ksys_chroot(const char __user *filename)
+ {
+ 	struct path path;
+ 	int error;
+ 	unsigned int lookup_flags = LOOKUP_FOLLOW | LOOKUP_DIRECTORY;
+ retry:
+ 	error = user_path_at(AT_FDCWD, filename, lookup_flags, &path);
+ 	if (error)
+ 		goto out;
+ 
+ 	error = inode_permission(path.dentry->d_inode, MAY_EXEC | MAY_CHDIR);
+ 	if (error)
+ 		goto dput_and_out;
+ 
+ 	error = -EPERM;
+ 	if (!ns_capable(current_user_ns(), CAP_SYS_CHROOT))
+ 		goto dput_and_out;
+ 	error = security_path_chroot(&path);
+ 	if (error)
+ 		goto dput_and_out;
+ 
+ 	if (gr_handle_chroot_chroot(path.dentry, path.mnt)) {
+ 		error = -EPERM;	/* error is 0 here; a refusal must not report success */
+ 		goto dput_and_out;
+ 	}
+ 	set_fs_root(current->fs, &path);
+ 	gr_handle_chroot_chdir(&path);
+ 	error = 0;
+ dput_and_out:
+ 	path_put(&path);
+ 	if (retry_estale(error, lookup_flags)) {
+ 		lookup_flags |= LOOKUP_REVAL;
+ 		goto retry;
+ 	}
+ out:
+ 	return error;
+ }
+ 
+ SYSCALL_DEFINE1(chroot, const char __user *, filename)
+ {
+ 	return ksys_chroot(filename);
+ }
+ 
+ static int chmod_common(const struct path *path, umode_t mode)
+ {
+ 	struct inode *inode = path->dentry->d_inode;
+ 	struct inode *delegated_inode = NULL;
+ 	struct iattr newattrs;
+ 	int error;
+ 
+ 	error = mnt_want_write(path->mnt);
+ 	if (error)
+ 		return error;
+ retry_deleg:
+ 	inode_lock(inode);
+ 
+ 	if (gr_handle_chroot_chmod(path->dentry, path->mnt, mode)) {
+ 		error = -EACCES;
+ 		goto out_unlock;
+ 	}
+ 
+ 	error = security_path_chmod(path, mode);
+ 	if (error)
+ 		goto out_unlock;
+ 	newattrs.ia_mode = (mode & S_IALLUGO) | (inode->i_mode & ~S_IALLUGO);
+ 	newattrs.ia_valid = ATTR_MODE | ATTR_CTIME;
+ 	error = notify_change(path->dentry, &newattrs, &delegated_inode);
+ out_unlock:
+ 	inode_unlock(inode);
+ 	if (delegated_inode) {
+ 		error = break_deleg_wait(&delegated_inode);
+ 		if (!error)
+ 			goto retry_deleg;
+ 	}
+ 	mnt_drop_write(path->mnt);
+ 	return error;
+ }
+ 
+ int ksys_fchmod(unsigned int fd, umode_t mode)
+ {
+ 	struct fd f = fdget(fd);
+ 	int err = -EBADF;
+ 
+ 	if (f.file) {
+ 		audit_file(f.file);
+ 		err = chmod_common(&f.file->f_path, mode);
+ 		fdput(f);
+ 	}
+ 	return err;
+ }
+ 
+ SYSCALL_DEFINE2(fchmod, unsigned int, fd, umode_t, mode)
+ {
+ 	return ksys_fchmod(fd, mode);
+ }
+ 
+ int do_fchmodat(int dfd, const char __user *filename, umode_t mode)
+ {
+ 	struct path path;
+ 	int error;
+ 	unsigned int lookup_flags = LOOKUP_FOLLOW;
+ retry:
+ 	error = user_path_at(dfd, filename, lookup_flags, &path);
+ 	if (!error) {
+ 		error = chmod_common(&path, mode);
+ 		path_put(&path);
+ 		if (retry_estale(error, lookup_flags)) {
+ 			lookup_flags |= LOOKUP_REVAL;
+ 			goto retry;
+ 		}
+ 	}
+ 	return error;
+ }
+ 
+ SYSCALL_DEFINE3(fchmodat, int, dfd, const char __user *, filename,
+ 		umode_t, mode)
+ {
+ 	return do_fchmodat(dfd, filename, mode);
+ }
+ 
+ SYSCALL_DEFINE2(chmod, const char __user *, filename, umode_t, mode)
+ {
+ 	return do_fchmodat(AT_FDCWD, filename, mode);
+ }
+ 
+ static int chown_common(const struct path *path, uid_t user, gid_t group)
+ {
+ 	struct inode *inode = path->dentry->d_inode;
+ 	struct inode *delegated_inode = NULL;
+ 	int error;
+ 	struct iattr newattrs;
+ 	kuid_t uid;
+ 	kgid_t gid;
+ 
+ 	uid = make_kuid(current_user_ns(), user);
+ 	gid = make_kgid(current_user_ns(), group);
+ 
+ retry_deleg:
+ 	newattrs.ia_valid =  ATTR_CTIME;
+ 	if (user != (uid_t) -1) {
+ 		if (!uid_valid(uid))
+ 			return -EINVAL;
+ 		newattrs.ia_valid |= ATTR_UID;
+ 		newattrs.ia_uid = uid;
+ 	}
+ 	if (group != (gid_t) -1) {
+ 		if (!gid_valid(gid))
+ 			return -EINVAL;
+ 		newattrs.ia_valid |= ATTR_GID;
+ 		newattrs.ia_gid = gid;
+ 	}
+ 	if (!S_ISDIR(inode->i_mode))
+ 		newattrs.ia_valid |=
+ 			ATTR_KILL_SUID | ATTR_KILL_SGID | ATTR_KILL_PRIV;
+ 	inode_lock(inode);
+ 	error = security_path_chown(path, uid, gid);
+ 	if (!error)
+ 		error = notify_change(path->dentry, &newattrs, &delegated_inode);
+ 	inode_unlock(inode);
+ 	if (delegated_inode) {
+ 		error = break_deleg_wait(&delegated_inode);
+ 		if (!error)
+ 			goto retry_deleg;
+ 	}
+ 	return error;
+ }
+ 
+ int do_fchownat(int dfd, const char __user *filename, uid_t user, gid_t group,
+ 		int flag)
+ {
+ 	struct path path;
+ 	int error = -EINVAL;
+ 	int lookup_flags;
+ 
+ 	if ((flag & ~(AT_SYMLINK_NOFOLLOW | AT_EMPTY_PATH)) != 0)
+ 		goto out;
+ 
+ 	lookup_flags = (flag & AT_SYMLINK_NOFOLLOW) ? 0 : LOOKUP_FOLLOW;
+ 	if (flag & AT_EMPTY_PATH)
+ 		lookup_flags |= LOOKUP_EMPTY;
+ retry:
+ 	error = user_path_at(dfd, filename, lookup_flags, &path);
+ 	if (error)
+ 		goto out;
+ 	error = mnt_want_write(path.mnt);
+ 	if (error)
+ 		goto out_release;
+ 	error = chown_common(&path, user, group);
+ 	mnt_drop_write(path.mnt);
+ out_release:
+ 	path_put(&path);
+ 	if (retry_estale(error, lookup_flags)) {
+ 		lookup_flags |= LOOKUP_REVAL;
+ 		goto retry;
+ 	}
+ out:
+ 	return error;
+ }
+ 
+ SYSCALL_DEFINE5(fchownat, int, dfd, const char __user *, filename, uid_t, user,
+ 		gid_t, group, int, flag)
+ {
+ 	return do_fchownat(dfd, filename, user, group, flag);
+ }
+ 
+ SYSCALL_DEFINE3(chown, const char __user *, filename, uid_t, user, gid_t, group)
+ {
+ 	return do_fchownat(AT_FDCWD, filename, user, group, 0);
+ }
+ 
+ SYSCALL_DEFINE3(lchown, const char __user *, filename, uid_t, user, gid_t, group)
+ {
+ 	return do_fchownat(AT_FDCWD, filename, user, group,
+ 			   AT_SYMLINK_NOFOLLOW);
+ }
+ 
+ int ksys_fchown(unsigned int fd, uid_t user, gid_t group)
+ {
+ 	struct fd f = fdget(fd);
+ 	int error = -EBADF;
+ 
+ 	if (!f.file)
+ 		goto out;
+ 
+ 	error = mnt_want_write_file(f.file);
+ 	if (error)
+ 		goto out_fput;
+ 	audit_file(f.file);
+ 	error = chown_common(&f.file->f_path, user, group);
+ 	mnt_drop_write_file(f.file);
+ out_fput:
+ 	fdput(f);
+ out:
+ 	return error;
+ }
+ 
+ SYSCALL_DEFINE3(fchown, unsigned int, fd, uid_t, user, gid_t, group)
+ {
+ 	return ksys_fchown(fd, user, group);
+ }
+ 
+ static int do_dentry_open(struct file *f,
+ 			  struct inode *inode,
+ 			  int (*open)(struct inode *, struct file *))
+ {
+ 	static const struct file_operations empty_fops = {};
+ 	int error;
+ 
+ 	path_get(&f->f_path);
+ 	f->f_inode = inode;
+ 	f->f_mapping = inode->i_mapping;
+ 
+ 	/* Ensure that we skip any errors that predate opening of the file */
+ 	f->f_wb_err = filemap_sample_wb_err(f->f_mapping);
+ 
+ 	if (unlikely(f->f_flags & O_PATH)) {
+ 		f->f_mode = FMODE_PATH | FMODE_OPENED;
+ 		f->f_op = &empty_fops;
+ 		return 0;
+ 	}
+ 
+ 	/* Any file opened for execve()/uselib() has to be a regular file. */
+ 	if (unlikely(f->f_flags & FMODE_EXEC && !S_ISREG(inode->i_mode))) {
+ 		error = -EACCES;
+ 		goto cleanup_file;
+ 	}
+ 
+ 	if (f->f_mode & FMODE_WRITE && !special_file(inode->i_mode)) {
+ 		error = get_write_access(inode);
+ 		if (unlikely(error))
+ 			goto cleanup_file;
+ 		error = __mnt_want_write(f->f_path.mnt);
+ 		if (unlikely(error)) {
+ 			put_write_access(inode);
+ 			goto cleanup_file;
+ 		}
+ 		f->f_mode |= FMODE_WRITER;
+ 	}
+ 
+ 	/* POSIX.1-2008/SUSv4 Section XSI 2.9.7 */
+ 	if (S_ISREG(inode->i_mode) || S_ISDIR(inode->i_mode))
+ 		f->f_mode |= FMODE_ATOMIC_POS;
+ 
+ 	f->f_op = fops_get(inode->i_fop);
+ 	if (WARN_ON(!f->f_op)) {
+ 		error = -ENODEV;
+ 		goto cleanup_all;
+ 	}
+ 
+ 	error = security_file_open(f);
+ 	if (error)
+ 		goto cleanup_all;
+ 
+ 	error = break_lease(locks_inode(f), f->f_flags);
+ 	if (error)
+ 		goto cleanup_all;
+ 
+ 	/* normally all 3 are set; ->open() can clear them if needed */
+ 	f->f_mode |= FMODE_LSEEK | FMODE_PREAD | FMODE_PWRITE;
+ 	if (!open)
+ 		open = f->f_op->open;
+ 	if (open) {
+ 		error = open(inode, f);
+ 		if (error)
+ 			goto cleanup_all;
+ 	}
+ 	f->f_mode |= FMODE_OPENED;
+ 	if ((f->f_mode & (FMODE_READ | FMODE_WRITE)) == FMODE_READ)
+ 		i_readcount_inc(inode);
+ 	if ((f->f_mode & FMODE_READ) &&
+ 	     likely(f->f_op->read || f->f_op->read_iter))
+ 		f->f_mode |= FMODE_CAN_READ;
+ 	if ((f->f_mode & FMODE_WRITE) &&
+ 	     likely(f->f_op->write || f->f_op->write_iter))
+ 		f->f_mode |= FMODE_CAN_WRITE;
+ 
+ 	f->f_write_hint = WRITE_LIFE_NOT_SET;
+ 	f->f_flags &= ~(O_CREAT | O_EXCL | O_NOCTTY | O_TRUNC);
+ 
+ 	file_ra_state_init(&f->f_ra, f->f_mapping->host->i_mapping);
+ 
+ 	/* NB: we're sure to have correct a_ops only after f_op->open */
+ 	if (f->f_flags & O_DIRECT) {
+ 		if (!f->f_mapping->a_ops || !f->f_mapping->a_ops->direct_IO)
+ 			return -EINVAL;
+ 	}
+ 
+ 	/*
+ 	 * XXX: Huge page cache doesn't support writing yet. Drop all page
+ 	 * cache for this file before processing writes.
+ 	 */
+ 	if ((f->f_mode & FMODE_WRITE) && filemap_nr_thps(inode->i_mapping))
+ 		truncate_pagecache(inode, 0);
+ 
+ 	return 0;
+ 
+ cleanup_all:
+ 	if (WARN_ON_ONCE(error > 0))
+ 		error = -EINVAL;
+ 	fops_put(f->f_op);
+ 	if (f->f_mode & FMODE_WRITER) {
+ 		put_write_access(inode);
+ 		__mnt_drop_write(f->f_path.mnt);
+ 	}
+ cleanup_file:
+ 	path_put(&f->f_path);
+ 	f->f_path.mnt = NULL;
+ 	f->f_path.dentry = NULL;
+ 	f->f_inode = NULL;
+ 	return error;
+ }
+ 
+ /**
+  * finish_open - finish opening a file
+  * @file: file pointer
+  * @dentry: pointer to dentry
+  * @open: open callback
+  * @opened: state of open
+  *
+  * This can be used to finish opening a file passed to i_op->atomic_open().
+  *
+  * If the open callback is set to NULL, then the standard f_op->open()
+  * filesystem callback is substituted.
+  *
+  * NB: the dentry reference is _not_ consumed.  If, for example, the dentry is
+  * the return value of d_splice_alias(), then the caller needs to perform dput()
+  * on it after finish_open().
+  *
+  * Returns zero on success or -errno if the open failed.
+  */
+ int finish_open(struct file *file, struct dentry *dentry,
+ 		int (*open)(struct inode *, struct file *))
+ {
+ 	BUG_ON(file->f_mode & FMODE_OPENED); /* once it's opened, it's opened */
+ 
+ 	file->f_path.dentry = dentry;
+ 	return do_dentry_open(file, d_backing_inode(dentry), open);
+ }
+ EXPORT_SYMBOL(finish_open);
+ 
+ /**
+  * finish_no_open - finish ->atomic_open() without opening the file
+  *
+  * @file: file pointer
+  * @dentry: dentry or NULL (as returned from ->lookup())
+  *
+  * This can be used to set the result of a successful lookup in ->atomic_open().
+  *
+  * NB: unlike finish_open() this function does consume the dentry reference and
+  * the caller need not dput() it.
+  *
+  * Returns "0" which must be the return value of ->atomic_open() after having
+  * called this function.
+  */
+ int finish_no_open(struct file *file, struct dentry *dentry)
+ {
+ 	file->f_path.dentry = dentry;
+ 	return 0;
+ }
+ EXPORT_SYMBOL(finish_no_open);
+ 
+ char *file_path(struct file *filp, char *buf, int buflen)
+ {
+ 	return d_path(&filp->f_path, buf, buflen);
+ }
+ EXPORT_SYMBOL(file_path);
+ 
+ /**
+  * vfs_open - open the file at the given path
+  * @path: path to open
+  * @file: newly allocated file with f_flag initialized
+  * @cred: credentials to use
+  */
+ int vfs_open(const struct path *path, struct file *file)
+ {
+ 	file->f_path = *path;
+ 	return do_dentry_open(file, d_backing_inode(path->dentry), NULL);
+ }
+ 
+ struct file *dentry_open(const struct path *path, int flags,
+ 			 const struct cred *cred)
+ {
+ 	int error;
+ 	struct file *f;
+ 
+ 	validate_creds(cred);
+ 
+ 	/* We must always pass in a valid mount pointer. */
+ 	BUG_ON(!path->mnt);
+ 
+ 	f = alloc_empty_file(flags, cred);
+ 	if (!IS_ERR(f)) {
+ 		error = vfs_open(path, f);
+ 		if (error) {
+ 			fput(f);
+ 			f = ERR_PTR(error);
+ 		}
+ 	}
+ 	return f;
+ }
+ EXPORT_SYMBOL(dentry_open);
+ 
+ struct file *open_with_fake_path(const struct path *path, int flags,
+ 				struct inode *inode, const struct cred *cred)
+ {
+ 	struct file *f = alloc_empty_file_noaccount(flags, cred);
+ 	if (!IS_ERR(f)) {
+ 		int error;
+ 
+ 		f->f_path = *path;
+ 		error = do_dentry_open(f, inode, NULL);
+ 		if (error) {
+ 			fput(f);
+ 			f = ERR_PTR(error);
+ 		}
+ 	}
+ 	return f;
+ }
+ EXPORT_SYMBOL(open_with_fake_path);
+ 
+ static inline int build_open_flags(int flags, umode_t mode, struct open_flags *op)
+ {
+ 	int lookup_flags = 0;
+ 	int acc_mode = ACC_MODE(flags);
+ 
+ 	/*
+ 	 * Clear out all open flags we don't know about so that we don't report
+ 	 * them in fcntl(F_GETFD) or similar interfaces.
+ 	 */
+ 	flags &= VALID_OPEN_FLAGS;
+ 
+ 	if (flags & (O_CREAT | __O_TMPFILE))
+ 		op->mode = (mode & S_IALLUGO) | S_IFREG;
+ 	else
+ 		op->mode = 0;
+ 
+ 	/* Must never be set by userspace */
+ 	flags &= ~FMODE_NONOTIFY & ~O_CLOEXEC;
+ 
+ 	/*
+ 	 * O_SYNC is implemented as __O_SYNC|O_DSYNC.  As many places only
+ 	 * check for O_DSYNC if they need any syncing at all we enforce it's
+ 	 * always set instead of having to deal with possibly weird behaviour
+ 	 * for malicious applications setting only __O_SYNC.
+ 	 */
+ 	if (flags & __O_SYNC)
+ 		flags |= O_DSYNC;
+ 
+ 	if (flags & __O_TMPFILE) {
+ 		if ((flags & O_TMPFILE_MASK) != O_TMPFILE)
+ 			return -EINVAL;
+ 		if (!(acc_mode & MAY_WRITE))
+ 			return -EINVAL;
+ 	} else if (flags & O_PATH) {
+ 		/*
+ 		 * If we have O_PATH in the open flag. Then we
+ 		 * cannot have anything other than the below set of flags
+ 		 */
+ 		flags &= O_DIRECTORY | O_NOFOLLOW | O_PATH;
+ 		acc_mode = 0;
+ 	}
+ 
+ 	op->open_flag = flags;
+ 
+ 	/* O_TRUNC implies we need access checks for write permissions */
+ 	if (flags & O_TRUNC)
+ 		acc_mode |= MAY_WRITE;
+ 
+ 	/* Allow the LSM permission hook to distinguish append
+ 	   access from general write access. */
+ 	if (flags & O_APPEND)
+ 		acc_mode |= MAY_APPEND;
+ 
+ 	op->acc_mode = acc_mode;
+ 
+ 	op->intent = flags & O_PATH ? 0 : LOOKUP_OPEN;
+ 
+ 	if (flags & O_CREAT) {
+ 		op->intent |= LOOKUP_CREATE;
+ 		if (flags & O_EXCL)
+ 			op->intent |= LOOKUP_EXCL;
+ 	}
+ 
+ 	if (flags & O_DIRECTORY)
+ 		lookup_flags |= LOOKUP_DIRECTORY;
+ 	if (!(flags & O_NOFOLLOW))
+ 		lookup_flags |= LOOKUP_FOLLOW;
+ 	op->lookup_flags = lookup_flags;
+ 	return 0;
+ }
+ 
+ /**
+  * file_open_name - open file and return file pointer
+  *
+  * @name:	struct filename containing path to open
+  * @flags:	open flags as per the open(2) second argument
+  * @mode:	mode for the new file if O_CREAT is set, else ignored
+  *
+  * This is the helper to open a file from kernelspace if you really
+  * have to.  But in general you should not do this, so please move
+  * along, nothing to see here..
+  */
+ struct file *file_open_name(struct filename *name, int flags, umode_t mode)
+ {
+ 	struct open_flags op;
+ 	int err = build_open_flags(flags, mode, &op);
+ 	return err ? ERR_PTR(err) : do_filp_open(AT_FDCWD, name, &op);
+ }
+ 
+ /**
+  * filp_open - open file and return file pointer
+  *
+  * @filename:	path to open
+  * @flags:	open flags as per the open(2) second argument
+  * @mode:	mode for the new file if O_CREAT is set, else ignored
+  *
+  * This is the helper to open a file from kernelspace if you really
+  * have to.  But in general you should not do this, so please move
+  * along, nothing to see here..
+  */
+ struct file *filp_open(const char *filename, int flags, umode_t mode)
+ {
+ 	struct filename *name = getname_kernel(filename);
+ 	struct file *file = ERR_CAST(name);
+ 	
+ 	if (!IS_ERR(name)) {
+ 		file = file_open_name(name, flags, mode);
+ 		putname(name);
+ 	}
+ 	return file;
+ }
+ EXPORT_SYMBOL(filp_open);
+ 
+ struct file *file_open_root(struct dentry *dentry, struct vfsmount *mnt,
+ 			    const char *filename, int flags, umode_t mode)
+ {
+ 	struct open_flags op;
+ 	int err = build_open_flags(flags, mode, &op);
+ 	if (err)
+ 		return ERR_PTR(err);
+ 	return do_file_open_root(dentry, mnt, filename, &op);
+ }
+ EXPORT_SYMBOL(file_open_root);
+ 
+ long do_sys_open(int dfd, const char __user *filename, int flags, umode_t mode)
+ {
+ 	struct open_flags op;
+ 	int fd = build_open_flags(flags, mode, &op);
+ 	struct filename *tmp;
+ 
+ 	if (fd)
+ 		return fd;
+ 
+ 	tmp = getname(filename);
+ 	if (IS_ERR(tmp))
+ 		return PTR_ERR(tmp);
+ 
+ 	fd = get_unused_fd_flags(flags);
+ 	if (fd >= 0) {
+ 		struct file *f = do_filp_open(dfd, tmp, &op);
+ 		if (IS_ERR(f)) {
+ 			put_unused_fd(fd);
+ 			fd = PTR_ERR(f);
+ 		} else {
+ 			fsnotify_open(f);
+ 			fd_install(fd, f);
+ 		}
+ 	}
+ 	putname(tmp);
+ 	return fd;
+ }
+ 
+ SYSCALL_DEFINE3(open, const char __user *, filename, int, flags, umode_t, mode)
+ {
+ 	if (force_o_largefile())
+ 		flags |= O_LARGEFILE;
+ 
+ 	return do_sys_open(AT_FDCWD, filename, flags, mode);
+ }
+ 
+ SYSCALL_DEFINE4(openat, int, dfd, const char __user *, filename, int, flags,
+ 		umode_t, mode)
+ {
+ 	if (force_o_largefile())
+ 		flags |= O_LARGEFILE;
+ 
+ 	return do_sys_open(dfd, filename, flags, mode);
+ }
+ 
+ #ifdef CONFIG_COMPAT
+ /*
+  * Exactly like sys_open(), except that it doesn't set the
+  * O_LARGEFILE flag.
+  */
+ COMPAT_SYSCALL_DEFINE3(open, const char __user *, filename, int, flags, umode_t, mode)
+ {
+ 	return do_sys_open(AT_FDCWD, filename, flags, mode);
+ }
+ 
+ /*
+  * Exactly like sys_openat(), except that it doesn't set the
+  * O_LARGEFILE flag.
+  */
+ COMPAT_SYSCALL_DEFINE4(openat, int, dfd, const char __user *, filename, int, flags, umode_t, mode)
+ {
+ 	return do_sys_open(dfd, filename, flags, mode);
+ }
+ #endif
+ 
+ #ifndef __alpha__
+ 
+ /*
+  * For backward compatibility?  Maybe this should be moved
+  * into arch/i386 instead?
+  */
+ SYSCALL_DEFINE2(creat, const char __user *, pathname, umode_t, mode)
+ {
+ 	return ksys_open(pathname, O_CREAT | O_WRONLY | O_TRUNC, mode);
+ }
+ 
+ #endif
+ 
+ /*
+  * "id" is the POSIX thread ID. We use the
+  * files pointer for this..
+  */
+ int filp_close(struct file *filp, fl_owner_t id)
+ {
+ 	int retval = 0;
+ 
+ 	if (!file_count(filp)) {
+ 		printk(KERN_ERR "VFS: Close: file count is 0\n");
+ 		return 0;
+ 	}
+ 
+ 	if (filp->f_op->flush)
+ 		retval = filp->f_op->flush(filp, id);
+ 
+ 	if (likely(!(filp->f_mode & FMODE_PATH))) {
+ 		dnotify_flush(filp, id);
+ 		locks_remove_posix(filp, id);
+ 	}
+ 	fput(filp);
+ 	return retval;
+ }
+ 
+ EXPORT_SYMBOL(filp_close);
+ 
+ /*
+  * Careful here! We test whether the file pointer is NULL before
+  * releasing the fd. This ensures that one clone task can't release
+  * an fd while another clone is opening it.
+  */
+ SYSCALL_DEFINE1(close, unsigned int, fd)
+ {
+ 	int retval = __close_fd(current->files, fd);
+ 
+ 	/* can't restart close syscall because file table entry was cleared */
+ 	if (unlikely(retval == -ERESTARTSYS ||
+ 		     retval == -ERESTARTNOINTR ||
+ 		     retval == -ERESTARTNOHAND ||
+ 		     retval == -ERESTART_RESTARTBLOCK))
+ 		retval = -EINTR;
+ 
+ 	return retval;
+ }
+ 
+ /*
+  * This routine simulates a hangup on the tty, to arrange that users
+  * are given clean terminals at login time.
+  */
+ SYSCALL_DEFINE0(vhangup)
+ {
+ 	if (capable(CAP_SYS_TTY_CONFIG)) {
+ 		tty_vhangup_self();
+ 		return 0;
+ 	}
+ 	return -EPERM;
+ }
+ 
+ /*
+  * Called when an inode is about to be open.
+  * We use this to disallow opening large files on 32bit systems if
+  * the caller didn't specify O_LARGEFILE.  On 64bit systems we force
+  * on this flag in sys_open.
+  */
+ int generic_file_open(struct inode * inode, struct file * filp)
+ {
+ 	if (!(filp->f_flags & O_LARGEFILE) && i_size_read(inode) > MAX_NON_LFS)
+ 		return -EOVERFLOW;
+ 	return 0;
+ }
+ 
+ EXPORT_SYMBOL(generic_file_open);
+ 
+ /*
+  * This is used by subsystems that don't want seekable
+  * file descriptors. The function is not supposed to ever fail, the only
+  * reason it returns an 'int' and not 'void' is so that it can be plugged
+  * directly into file_operations structure.
+  */
+ int nonseekable_open(struct inode *inode, struct file *filp)
+ {
+ 	filp->f_mode &= ~(FMODE_LSEEK | FMODE_PREAD | FMODE_PWRITE);
+ 	return 0;
+ }
+ 
+ EXPORT_SYMBOL(nonseekable_open);
+ 
+ /*
+  * stream_open is used by subsystems that want stream-like file descriptors.
+  * Such file descriptors are not seekable and don't have notion of position
+  * (file.f_pos is always 0 and ppos passed to .read()/.write() is always NULL).
+  * Contrary to file descriptors of other regular files, .read() and .write()
+  * can run simultaneously.
+  *
+  * stream_open never fails and is marked to return int so that it could be
+  * directly used as file_operations.open .
+  */
+ int stream_open(struct inode *inode, struct file *filp)
+ {
+ 	filp->f_mode &= ~(FMODE_LSEEK | FMODE_PREAD | FMODE_PWRITE | FMODE_ATOMIC_POS);
+ 	filp->f_mode |= FMODE_STREAM;
+ 	return 0;
+ }
+ 
+ EXPORT_SYMBOL(stream_open);
diff --color -rcNP Master/fs/open.c.rej OG/fs/open.c.rej
*** Master/fs/open.c.rej	1969-12-31 19:00:00.000000000 -0500
--- OG/fs/open.c.rej	2021-04-20 15:11:27.316000000 -0400
***************
*** 0 ****
--- 1,19 ----
+ *** fs/open.c	2021-03-13 14:18:55.000000000 +0200
+ --- fs/open.c	2021-03-11 15:06:51.000000000 +0200
+ ***************
+ *** 1056,1062 ****
+   {
+   	struct filename *name = getname_kernel(filename);
+   	struct file *file = ERR_CAST(name);
+ ! 
+   	if (!IS_ERR(name)) {
+   		file = file_open_name(name, flags, mode);
+   		putname(name);
+ --- 1044,1050 ----
+   {
+   	struct filename *name = getname_kernel(filename);
+   	struct file *file = ERR_CAST(name);
+ ! 
+   	if (!IS_ERR(name)) {
+   		file = file_open_name(name, flags, mode);
+   		putname(name);
diff --color -rcNP Master/fs/proc/array.c OG/fs/proc/array.c
*** Master/fs/proc/array.c	2021-04-20 14:17:31.000000000 -0400
--- OG/fs/proc/array.c	2021-04-20 15:11:34.511000000 -0400
***************
*** 401,406 ****
--- 401,429 ----
  	seq_printf(m, "THP_enabled:\t%d\n", thp_enabled);
  }
  
+ #if defined(CONFIG_MINISEC_NOEXEC) || defined(CONFIG_MINISEC_ASLR)
+ /*
+  * Print the "PaX:" status line: one letter per MF_PAX_* flag, upper case
+  * when the flag is set in mm->pax_flags and lower case when it is clear.
+  * A task without an mm gets "-----".
+  */
+ static inline void task_pax(struct seq_file *m, struct task_struct *p)
+ {
+ 	/* p may be exiting: ->mm is only stable while task_lock() is held. */
+ 	task_lock(p);
+ 	if (p->mm)
+ 		seq_printf(m, "PaX:\t%c%c%c%c%c\n",
+ 			   p->mm->pax_flags & MF_PAX_PAGEEXEC ? 'P' : 'p',
+ 			   p->mm->pax_flags & MF_PAX_EMUTRAMP ? 'E' : 'e',
+ 			   p->mm->pax_flags & MF_PAX_MPROTECT ? 'M' : 'm',
+ 			   p->mm->pax_flags & MF_PAX_RANDMMAP ? 'R' : 'r',
+ 			   p->mm->pax_flags & MF_PAX_SEGMEXEC ? 'S' : 's');
+ 	else
+ 		seq_puts(m, "PaX:\t-----\n");
+ 	task_unlock(p);
+ }
+ #endif
+ 
  int proc_pid_status(struct seq_file *m, struct pid_namespace *ns,
  			struct pid *pid, struct task_struct *task)
  {
***************
*** 424,429 ****
--- 439,449 ----
  	task_cpus_allowed(m, task);
  	cpuset_task_status_allowed(m, task);
  	task_context_switch_counts(m, task);
+ 
+ #if defined(CONFIG_MINISEC_NOEXEC) || defined(CONFIG_MINISEC_ASLR)
+ 	task_pax(m, task);
+ #endif
+ 
  	return 0;
  }
  
diff --color -rcNP Master/fs/proc/base.c OG/fs/proc/base.c
*** Master/fs/proc/base.c	2021-04-20 14:17:31.000000000 -0400
--- OG/fs/proc/base.c	2021-04-20 15:14:26.983000000 -0400
***************
*** 94,99 ****
--- 94,100 ----
  #include <linux/sched/debug.h>
  #include <linux/sched/stat.h>
  #include <linux/posix-timers.h>
+ #include <linux/minisec.h>
  #include <trace/events/oom.h>
  #include "internal.h"
  #include "fd.h"
***************
*** 699,704 ****
--- 700,729 ----
  				 struct task_struct *task,
  				 int hide_pid_min)
  {
+ 	if (gr_pid_is_chrooted(task))
+ 		return false;
+ 
+ #if defined(CONFIG_MINISEC_PROC_USER) || defined(CONFIG_MINISEC_PROC_USERGROUP)
+ 	rcu_read_lock();
+ 	{
+ 		const struct cred *tmpcred = current_cred();
+ 		const struct cred *cred = __task_cred(task);
+ 
+ 		if (uid_eq(tmpcred->uid, GLOBAL_ROOT_UID) || uid_eq(tmpcred->uid, cred->uid)
+ #ifdef CONFIG_MINISEC_PROC_USERGROUP
+ 		    || in_group_p(grsec_proc_gid)
+ #endif
+ 		) {
+ 			rcu_read_unlock();
+ 			return true;
+ 		}
+ 	}
+ 	rcu_read_unlock();
+ 	/* Caller is not root, the task's uid or in grsec_proc_gid: with hide_pid set, require ptrace read access. */
+ 	if (pid->hide_pid)
+ 		return ptrace_may_access(task, PTRACE_MODE_READ_FSCREDS | PTRACE_MODE_NOAUDIT);
+ #endif
+ 
  	if (pid->hide_pid < hide_pid_min)
  		return true;
  	if (in_group_p(pid->pid_gid))
***************
*** 719,725 ****
--- 744,754 ----
  	has_perms = has_pid_permissions(pid, task, HIDEPID_NO_ACCESS);
  	put_task_struct(task);
  
+ #if defined(CONFIG_MINISEC_PROC_USER) || defined(CONFIG_MINISEC_PROC_USERGROUP)
+ 		{
+ #else
  	if (!has_perms) {
+ #endif
  		if (pid->hide_pid == HIDEPID_INVISIBLE) {
  			/*
  			 * Let's make getdents(), stat(), and open()
***************
*** 1704,1710 ****
--- 1733,1743 ----
  	rcu_read_lock();
  	cred = __task_cred(task);
  	uid = cred->euid;
+ #ifdef CONFIG_MINISEC_PROC_USERGROUP
+ 	gid = grsec_proc_gid;
+ #else
  	gid = cred->egid;
+ #endif
  	rcu_read_unlock();
  
  	/*
***************
*** 2472,2478 ****
  	return d_splice_alias(inode, dentry);
  }
  
! static struct dentry *proc_pident_lookup(struct inode *dir, 
  					 struct dentry *dentry,
  					 const struct pid_entry *p,
  					 const struct pid_entry *end)
--- 2505,2511 ----
  	return d_splice_alias(inode, dentry);
  }
  
! static struct dentry *proc_pident_lookup(struct inode *dir,
  					 struct dentry *dentry,
  					 const struct pid_entry *p,
  					 const struct pid_entry *end)
***************
*** 2483,2488 ****
--- 2516,2525 ----
  	if (!task)
  		goto out_no_task;
  
+ 	if (gr_pid_is_chrooted(task)) {
+ 		put_task_struct(task);	/* out_no_task does not drop our task reference */
+ 		goto out_no_task;
+ 	}
  	/*
  	 * Yes, it does not scale. And it should not. Don't add
  	 * new entries into /proc/<tgid>/ without very good reasons.
***************
*** 2509,2514 ****
--- 2546,2553 ----
  	if (!task)
  		return -ENOENT;
  
+ 	if (gr_pid_is_chrooted(task))
+ 		goto out;
  	if (!dir_emit_dots(file, ctx))
  		goto out;
  
***************
*** 2659,2665 ****
  
  static int proc_attr_dir_readdir(struct file *file, struct dir_context *ctx)
  {
! 	return proc_pident_readdir(file, ctx, 
  				   attr_dir_stuff, ARRAY_SIZE(attr_dir_stuff));
  }
  
--- 2698,2704 ----
  
  static int proc_attr_dir_readdir(struct file *file, struct dir_context *ctx)
  {
! 	return proc_pident_readdir(file, ctx,
  				   attr_dir_stuff, ARRAY_SIZE(attr_dir_stuff));
  }
  
***************
*** 3223,3229 ****
--- 3262,3274 ----
  {
  	struct inode *inode;
  
+ #ifdef CONFIG_MINISEC_PROC_USER
+ 	inode = proc_pid_make_inode(dentry->d_sb, task, S_IFDIR | S_IRUSR | S_IXUSR);
+ #elif defined(CONFIG_MINISEC_PROC_USERGROUP)
+ 	inode = proc_pid_make_inode(dentry->d_sb, task, S_IFDIR | S_IRUSR | S_IRGRP | S_IXUSR | S_IXGRP);
+ #else
  	inode = proc_pid_make_inode(dentry->d_sb, task, S_IFDIR | S_IRUGO | S_IXUGO);
+ #endif
  	if (!inode)
  		return ERR_PTR(-ENOENT);
  
***************
*** 3517,3523 ****
--- 3562,3575 ----
  	struct task_struct *task, const void *ptr)
  {
  	struct inode *inode;
+ 
+ #ifdef CONFIG_MINISEC_PROC_USER
+ 	inode = proc_pid_make_inode(dentry->d_sb, task, S_IFDIR | S_IRUSR | S_IXUSR);
+ #elif defined(CONFIG_MINISEC_PROC_USERGROUP)
+ 	inode = proc_pid_make_inode(dentry->d_sb, task, S_IFDIR | S_IRUSR | S_IRGRP | S_IXUSR | S_IXGRP);
+ #else
  	inode = proc_pid_make_inode(dentry->d_sb, task, S_IFDIR | S_IRUGO | S_IXUGO);
+ #endif
  	if (!inode)
  		return ERR_PTR(-ENOENT);
  
diff --color -rcNP Master/fs/proc/base.c.orig OG/fs/proc/base.c.orig
*** Master/fs/proc/base.c.orig	1969-12-31 19:00:00.000000000 -0500
--- OG/fs/proc/base.c.orig	2021-04-20 15:10:45.382000000 -0400
***************
*** 0 ****
--- 1,3771 ----
+ // SPDX-License-Identifier: GPL-2.0
+ /*
+  *  linux/fs/proc/base.c
+  *
+  *  Copyright (C) 1991, 1992 Linus Torvalds
+  *
+  *  proc base directory handling functions
+  *
+  *  1999, Al Viro. Rewritten. Now it covers the whole per-process part.
+  *  Instead of using magical inumbers to determine the kind of object
+  *  we allocate and fill in-core inodes upon lookup. They don't even
+  *  go into icache. We cache the reference to task_struct upon lookup too.
+  *  Eventually it should become a filesystem in its own. We don't use the
+  *  rest of procfs anymore.
+  *
+  *
+  *  Changelog:
+  *  17-Jan-2005
+  *  Allan Bezerra
+  *  Bruna Moreira <bruna.moreira@indt.org.br>
+  *  Edjard Mota <edjard.mota@indt.org.br>
+  *  Ilias Biris <ilias.biris@indt.org.br>
+  *  Mauricio Lin <mauricio.lin@indt.org.br>
+  *
+  *  Embedded Linux Lab - 10LE Instituto Nokia de Tecnologia - INdT
+  *
+  *  A new process specific entry (smaps) included in /proc. It shows the
+  *  size of rss for each memory area. The maps entry lacks information
+  *  about physical memory size (rss) for each mapped file, i.e.,
+  *  rss information for executables and library files.
+  *  This additional information is useful for any tools that need to know
+  *  about physical memory consumption for a process specific library.
+  *
+  *  Changelog:
+  *  21-Feb-2005
+  *  Embedded Linux Lab - 10LE Instituto Nokia de Tecnologia - INdT
+  *  Pud inclusion in the page table walking.
+  *
+  *  ChangeLog:
+  *  10-Mar-2005
+  *  10LE Instituto Nokia de Tecnologia - INdT:
+  *  A better way to walks through the page table as suggested by Hugh Dickins.
+  *
+  *  Simo Piiroinen <simo.piiroinen@nokia.com>:
+  *  Smaps information related to shared, private, clean and dirty pages.
+  *
+  *  Paul Mundt <paul.mundt@nokia.com>:
+  *  Overall revision about smaps.
+  */
+ 
+ #include <linux/uaccess.h>
+ 
+ #include <linux/errno.h>
+ #include <linux/time.h>
+ #include <linux/proc_fs.h>
+ #include <linux/stat.h>
+ #include <linux/task_io_accounting_ops.h>
+ #include <linux/init.h>
+ #include <linux/capability.h>
+ #include <linux/file.h>
+ #include <linux/fdtable.h>
+ #include <linux/generic-radix-tree.h>
+ #include <linux/string.h>
+ #include <linux/seq_file.h>
+ #include <linux/namei.h>
+ #include <linux/mnt_namespace.h>
+ #include <linux/mm.h>
+ #include <linux/swap.h>
+ #include <linux/rcupdate.h>
+ #include <linux/kallsyms.h>
+ #include <linux/stacktrace.h>
+ #include <linux/resource.h>
+ #include <linux/module.h>
+ #include <linux/mount.h>
+ #include <linux/security.h>
+ #include <linux/ptrace.h>
+ #include <linux/tracehook.h>
+ #include <linux/printk.h>
+ #include <linux/cache.h>
+ #include <linux/cgroup.h>
+ #include <linux/cpuset.h>
+ #include <linux/audit.h>
+ #include <linux/poll.h>
+ #include <linux/nsproxy.h>
+ #include <linux/oom.h>
+ #include <linux/elf.h>
+ #include <linux/pid_namespace.h>
+ #include <linux/user_namespace.h>
+ #include <linux/fs_struct.h>
+ #include <linux/slab.h>
+ #include <linux/sched/autogroup.h>
+ #include <linux/sched/mm.h>
+ #include <linux/sched/coredump.h>
+ #include <linux/sched/debug.h>
+ #include <linux/sched/stat.h>
+ #include <linux/posix-timers.h>
+ #include <linux/minisec.h>
+ #include <trace/events/oom.h>
+ #include "internal.h"
+ #include "fd.h"
+ 
+ #include "../../lib/kstrtox.h"
+ 
+ /* NOTE:
+  *	Implementing inode permission operations in /proc is almost
+  *	certainly an error.  Permission checks need to happen during
+  *	each system call not at open time.  The reason is that most of
+  *	what we wish to check for permissions in /proc varies at runtime.
+  *
+  *	The classic example of a problem is opening file descriptors
+  *	in /proc for a task before it execs a suid executable.
+  */
+ 
+ static u8 nlink_tid __ro_after_init;
+ static u8 nlink_tgid __ro_after_init;
+ 
+ struct pid_entry {
+ 	const char *name;
+ 	unsigned int len;
+ 	umode_t mode;
+ 	const struct inode_operations *iop;
+ 	const struct file_operations *fop;
+ 	union proc_op op;
+ };
+ 
+ #define NOD(NAME, MODE, IOP, FOP, OP) {			\
+ 	.name = (NAME),					\
+ 	.len  = sizeof(NAME) - 1,			\
+ 	.mode = MODE,					\
+ 	.iop  = IOP,					\
+ 	.fop  = FOP,					\
+ 	.op   = OP,					\
+ }
+ 
+ #define DIR(NAME, MODE, iops, fops)	\
+ 	NOD(NAME, (S_IFDIR|(MODE)), &iops, &fops, {} )
+ #define LNK(NAME, get_link)					\
+ 	NOD(NAME, (S_IFLNK|S_IRWXUGO),				\
+ 		&proc_pid_link_inode_operations, NULL,		\
+ 		{ .proc_get_link = get_link } )
+ #define REG(NAME, MODE, fops)				\
+ 	NOD(NAME, (S_IFREG|(MODE)), NULL, &fops, {})
+ #define ONE(NAME, MODE, show)				\
+ 	NOD(NAME, (S_IFREG|(MODE)),			\
+ 		NULL, &proc_single_file_operations,	\
+ 		{ .proc_show = show } )
+ #define ATTR(LSM, NAME, MODE)				\
+ 	NOD(NAME, (S_IFREG|(MODE)),			\
+ 		NULL, &proc_pid_attr_operations,	\
+ 		{ .lsm = LSM })
+ 
+ /*
+  * Count the number of hardlinks for the pid_entry table, excluding the .
+  * and .. links.
+  */
+ static unsigned int __init pid_entry_nlink(const struct pid_entry *entries,
+ 	unsigned int n)
+ {
+ 	unsigned int i;
+ 	unsigned int count;
+ 
+ 	count = 2;
+ 	for (i = 0; i < n; ++i) {
+ 		if (S_ISDIR(entries[i].mode))
+ 			++count;
+ 	}
+ 
+ 	return count;
+ }
+ 
+ static int get_task_root(struct task_struct *task, struct path *root)
+ {
+ 	int result = -ENOENT;
+ 
+ 	task_lock(task);
+ 	if (task->fs) {
+ 		get_fs_root(task->fs, root);
+ 		result = 0;
+ 	}
+ 	task_unlock(task);
+ 	return result;
+ }
+ 
+ static int proc_cwd_link(struct dentry *dentry, struct path *path)
+ {
+ 	struct task_struct *task = get_proc_task(d_inode(dentry));
+ 	int result = -ENOENT;
+ 
+ 	if (task) {
+ 		task_lock(task);
+ 		if (task->fs) {
+ 			get_fs_pwd(task->fs, path);
+ 			result = 0;
+ 		}
+ 		task_unlock(task);
+ 		put_task_struct(task);
+ 	}
+ 	return result;
+ }
+ 
+ static int proc_root_link(struct dentry *dentry, struct path *path)
+ {
+ 	struct task_struct *task = get_proc_task(d_inode(dentry));
+ 	int result = -ENOENT;
+ 
+ 	if (task) {
+ 		result = get_task_root(task, path);
+ 		put_task_struct(task);
+ 	}
+ 	return result;
+ }
+ 
+ /*
+  * If the user used setproctitle(), we just get the string from
+  * user space at arg_start, and limit it to a maximum of one page.
+  */
+ static ssize_t get_mm_proctitle(struct mm_struct *mm, char __user *buf,
+ 				size_t count, unsigned long pos,
+ 				unsigned long arg_start)
+ {
+ 	char *page;
+ 	int ret, got;
+ 
+ 	if (pos >= PAGE_SIZE)
+ 		return 0;
+ 
+ 	page = (char *)__get_free_page(GFP_KERNEL);
+ 	if (!page)
+ 		return -ENOMEM;
+ 
+ 	ret = 0;
+ 	got = access_remote_vm(mm, arg_start, page, PAGE_SIZE, FOLL_ANON);
+ 	if (got > 0) {
+ 		int len = strnlen(page, got);
+ 
+ 		/* Include the NUL character if it was found */
+ 		if (len < got)
+ 			len++;
+ 
+ 		if (len > pos) {
+ 			len -= pos;
+ 			if (len > count)
+ 				len = count;
+ 			len -= copy_to_user(buf, page+pos, len);
+ 			if (!len)
+ 				len = -EFAULT;
+ 			ret = len;
+ 		}
+ 	}
+ 	free_page((unsigned long)page);
+ 	return ret;
+ }
+ 
+ static ssize_t get_mm_cmdline(struct mm_struct *mm, char __user *buf,
+ 			      size_t count, loff_t *ppos)
+ {
+ 	unsigned long arg_start, arg_end, env_start, env_end;
+ 	unsigned long pos, len;
+ 	char *page, c;
+ 
+ 	/* Check if process spawned far enough to have cmdline. */
+ 	if (!mm->env_end)
+ 		return 0;
+ 
+ 	spin_lock(&mm->arg_lock);
+ 	arg_start = mm->arg_start;
+ 	arg_end = mm->arg_end;
+ 	env_start = mm->env_start;
+ 	env_end = mm->env_end;
+ 	spin_unlock(&mm->arg_lock);
+ 
+ 	if (arg_start >= arg_end)
+ 		return 0;
+ 
+ 	/*
+ 	 * We allow setproctitle() to overwrite the argument
+ 	 * strings, and overflow past the original end. But
+ 	 * only when it overflows into the environment area.
+ 	 */
+ 	if (env_start != arg_end || env_end < env_start)
+ 		env_start = env_end = arg_end;
+ 	len = env_end - arg_start;
+ 
+ 	/* We're not going to care if "*ppos" has high bits set */
+ 	pos = *ppos;
+ 	if (pos >= len)
+ 		return 0;
+ 	if (count > len - pos)
+ 		count = len - pos;
+ 	if (!count)
+ 		return 0;
+ 
+ 	/*
+ 	 * Magical special case: if the argv[] end byte is not
+ 	 * zero, the user has overwritten it with setproctitle(3).
+ 	 *
+ 	 * Possible future enhancement: do this only once when
+ 	 * pos is 0, and set a flag in the 'struct file'.
+ 	 */
+ 	if (access_remote_vm(mm, arg_end-1, &c, 1, FOLL_ANON) == 1 && c)
+ 		return get_mm_proctitle(mm, buf, count, pos, arg_start);
+ 
+ 	/*
+ 	 * For the non-setproctitle() case we limit things strictly
+ 	 * to the [arg_start, arg_end[ range.
+ 	 */
+ 	pos += arg_start;
+ 	if (pos < arg_start || pos >= arg_end)
+ 		return 0;
+ 	if (count > arg_end - pos)
+ 		count = arg_end - pos;
+ 
+ 	page = (char *)__get_free_page(GFP_KERNEL);
+ 	if (!page)
+ 		return -ENOMEM;
+ 
+ 	len = 0;
+ 	while (count) {
+ 		int got;
+ 		size_t size = min_t(size_t, PAGE_SIZE, count);
+ 
+ 		got = access_remote_vm(mm, pos, page, size, FOLL_ANON);
+ 		if (got <= 0)
+ 			break;
+ 		got -= copy_to_user(buf, page, got);
+ 		if (unlikely(!got)) {
+ 			if (!len)
+ 				len = -EFAULT;
+ 			break;
+ 		}
+ 		pos += got;
+ 		buf += got;
+ 		len += got;
+ 		count -= got;
+ 	}
+ 
+ 	free_page((unsigned long)page);
+ 	return len;
+ }
+ 
+ static ssize_t get_task_cmdline(struct task_struct *tsk, char __user *buf,
+ 				size_t count, loff_t *pos)
+ {
+ 	struct mm_struct *mm;
+ 	ssize_t ret;
+ 
+ 	mm = get_task_mm(tsk);
+ 	if (!mm)
+ 		return 0;
+ 
+ 	ret = get_mm_cmdline(mm, buf, count, pos);
+ 	mmput(mm);
+ 	return ret;
+ }
+ 
+ static ssize_t proc_pid_cmdline_read(struct file *file, char __user *buf,
+ 				     size_t count, loff_t *pos)
+ {
+ 	struct task_struct *tsk;
+ 	ssize_t ret;
+ 
+ 	BUG_ON(*pos < 0);
+ 
+ 	tsk = get_proc_task(file_inode(file));
+ 	if (!tsk)
+ 		return -ESRCH;
+ 	ret = get_task_cmdline(tsk, buf, count, pos);
+ 	put_task_struct(tsk);
+ 	if (ret > 0)
+ 		*pos += ret;
+ 	return ret;
+ }
+ 
+ static const struct file_operations proc_pid_cmdline_ops = {
+ 	.read	= proc_pid_cmdline_read,
+ 	.llseek	= generic_file_llseek,
+ };
+ 
+ #ifdef CONFIG_KALLSYMS
+ /*
+  * Provides a wchan file via kallsyms in a proper one-value-per-file format.
+  * Returns the resolved symbol.  If that fails, simply return the address.
+  */
+ static int proc_pid_wchan(struct seq_file *m, struct pid_namespace *ns,
+ 			  struct pid *pid, struct task_struct *task)
+ {
+ 	unsigned long wchan;
+ 	char symname[KSYM_NAME_LEN];
+ 
+ 	if (!ptrace_may_access(task, PTRACE_MODE_READ_FSCREDS))
+ 		goto print0;
+ 
+ 	wchan = get_wchan(task);
+ 	if (wchan && !lookup_symbol_name(wchan, symname)) {
+ 		seq_puts(m, symname);
+ 		return 0;
+ 	}
+ 
+ print0:
+ 	seq_putc(m, '0');
+ 	return 0;
+ }
+ #endif /* CONFIG_KALLSYMS */
+ 
+ static int lock_trace(struct task_struct *task)
+ {
+ 	int err = down_read_killable(&task->signal->exec_update_lock);
+ 	if (err)
+ 		return err;
+ 	if (!ptrace_may_access(task, PTRACE_MODE_ATTACH_FSCREDS)) {
+ 		up_read(&task->signal->exec_update_lock);
+ 		return -EPERM;
+ 	}
+ 	return 0;
+ }
+ 
+ static void unlock_trace(struct task_struct *task)
+ {
+ 	up_read(&task->signal->exec_update_lock);
+ }
+ 
+ #ifdef CONFIG_STACKTRACE
+ 
+ #define MAX_STACK_TRACE_DEPTH	64
+ 
+ static int proc_pid_stack(struct seq_file *m, struct pid_namespace *ns,
+ 			  struct pid *pid, struct task_struct *task)
+ {
+ 	unsigned long *entries;
+ 	int err;
+ 
+ 	/*
+ 	 * The ability to racily run the kernel stack unwinder on a running task
+ 	 * and then observe the unwinder output is scary; while it is useful for
+ 	 * debugging kernel issues, it can also allow an attacker to leak kernel
+ 	 * stack contents.
+ 	 * Doing this in a manner that is at least safe from races would require
+ 	 * some work to ensure that the remote task can not be scheduled; and
+ 	 * even then, this would still expose the unwinder as local attack
+ 	 * surface.
+ 	 * Therefore, this interface is restricted to root.
+ 	 */
+ 	if (!file_ns_capable(m->file, &init_user_ns, CAP_SYS_ADMIN))
+ 		return -EACCES;
+ 
+ 	entries = kmalloc_array(MAX_STACK_TRACE_DEPTH, sizeof(*entries),
+ 				GFP_KERNEL);
+ 	if (!entries)
+ 		return -ENOMEM;
+ 
+ 	err = lock_trace(task);
+ 	if (!err) {
+ 		unsigned int i, nr_entries;
+ 
+ 		nr_entries = stack_trace_save_tsk(task, entries,
+ 						  MAX_STACK_TRACE_DEPTH, 0);
+ 
+ 		for (i = 0; i < nr_entries; i++) {
+ 			seq_printf(m, "[<0>] %pB\n", (void *)entries[i]);
+ 		}
+ 
+ 		unlock_trace(task);
+ 	}
+ 	kfree(entries);
+ 
+ 	return err;
+ }
+ #endif
+ 
+ #ifdef CONFIG_SCHED_INFO
+ /*
+  * Provides /proc/PID/schedstat
+  */
+ static int proc_pid_schedstat(struct seq_file *m, struct pid_namespace *ns,
+ 			      struct pid *pid, struct task_struct *task)
+ {
+ 	if (unlikely(!sched_info_on()))
+ 		seq_puts(m, "0 0 0\n");
+ 	else
+ 		seq_printf(m, "%llu %llu %lu\n",
+ 		   (unsigned long long)task->se.sum_exec_runtime,
+ 		   (unsigned long long)task->sched_info.run_delay,
+ 		   task->sched_info.pcount);
+ 
+ 	return 0;
+ }
+ #endif
+ 
+ #ifdef CONFIG_LATENCYTOP
+ static int lstats_show_proc(struct seq_file *m, void *v)
+ {
+ 	int i;
+ 	struct inode *inode = m->private;
+ 	struct task_struct *task = get_proc_task(inode);
+ 
+ 	if (!task)
+ 		return -ESRCH;
+ 	seq_puts(m, "Latency Top version : v0.1\n");
+ 	for (i = 0; i < LT_SAVECOUNT; i++) {
+ 		struct latency_record *lr = &task->latency_record[i];
+ 		if (lr->backtrace[0]) {
+ 			int q;
+ 			seq_printf(m, "%i %li %li",
+ 				   lr->count, lr->time, lr->max);
+ 			for (q = 0; q < LT_BACKTRACEDEPTH; q++) {
+ 				unsigned long bt = lr->backtrace[q];
+ 
+ 				if (!bt)
+ 					break;
+ 				seq_printf(m, " %ps", (void *)bt);
+ 			}
+ 			seq_putc(m, '\n');
+ 		}
+ 
+ 	}
+ 	put_task_struct(task);
+ 	return 0;
+ }
+ 
+ static int lstats_open(struct inode *inode, struct file *file)
+ {
+ 	return single_open(file, lstats_show_proc, inode);
+ }
+ 
+ static ssize_t lstats_write(struct file *file, const char __user *buf,
+ 			    size_t count, loff_t *offs)
+ {
+ 	struct task_struct *task = get_proc_task(file_inode(file));
+ 
+ 	if (!task)
+ 		return -ESRCH;
+ 	clear_tsk_latency_tracing(task);
+ 	put_task_struct(task);
+ 
+ 	return count;
+ }
+ 
+ static const struct file_operations proc_lstats_operations = {
+ 	.open		= lstats_open,
+ 	.read		= seq_read,
+ 	.write		= lstats_write,
+ 	.llseek		= seq_lseek,
+ 	.release	= single_release,
+ };
+ 
+ #endif
+ 
+ static int proc_oom_score(struct seq_file *m, struct pid_namespace *ns,
+ 			  struct pid *pid, struct task_struct *task)
+ {
+ 	unsigned long totalpages = totalram_pages() + total_swap_pages;
+ 	unsigned long points = 0;
+ 
+ 	points = oom_badness(task, totalpages) * 1000 / totalpages;
+ 	seq_printf(m, "%lu\n", points);
+ 
+ 	return 0;
+ }
+ 
+ struct limit_names {
+ 	const char *name;
+ 	const char *unit;
+ };
+ 
+ static const struct limit_names lnames[RLIM_NLIMITS] = {
+ 	[RLIMIT_CPU] = {"Max cpu time", "seconds"},
+ 	[RLIMIT_FSIZE] = {"Max file size", "bytes"},
+ 	[RLIMIT_DATA] = {"Max data size", "bytes"},
+ 	[RLIMIT_STACK] = {"Max stack size", "bytes"},
+ 	[RLIMIT_CORE] = {"Max core file size", "bytes"},
+ 	[RLIMIT_RSS] = {"Max resident set", "bytes"},
+ 	[RLIMIT_NPROC] = {"Max processes", "processes"},
+ 	[RLIMIT_NOFILE] = {"Max open files", "files"},
+ 	[RLIMIT_MEMLOCK] = {"Max locked memory", "bytes"},
+ 	[RLIMIT_AS] = {"Max address space", "bytes"},
+ 	[RLIMIT_LOCKS] = {"Max file locks", "locks"},
+ 	[RLIMIT_SIGPENDING] = {"Max pending signals", "signals"},
+ 	[RLIMIT_MSGQUEUE] = {"Max msgqueue size", "bytes"},
+ 	[RLIMIT_NICE] = {"Max nice priority", NULL},
+ 	[RLIMIT_RTPRIO] = {"Max realtime priority", NULL},
+ 	[RLIMIT_RTTIME] = {"Max realtime timeout", "us"},
+ };
+ 
+ /* Display limits for a process */
+ static int proc_pid_limits(struct seq_file *m, struct pid_namespace *ns,
+ 			   struct pid *pid, struct task_struct *task)
+ {
+ 	unsigned int i;
+ 	unsigned long flags;
+ 
+ 	struct rlimit rlim[RLIM_NLIMITS];
+ 
+ 	if (!lock_task_sighand(task, &flags))
+ 		return 0;
+ 	memcpy(rlim, task->signal->rlim, sizeof(struct rlimit) * RLIM_NLIMITS);
+ 	unlock_task_sighand(task, &flags);
+ 
+ 	/*
+ 	 * print the file header
+ 	 */
+ 	seq_puts(m, "Limit                     "
+ 		"Soft Limit           "
+ 		"Hard Limit           "
+ 		"Units     \n");
+ 
+ 	for (i = 0; i < RLIM_NLIMITS; i++) {
+ 		if (rlim[i].rlim_cur == RLIM_INFINITY)
+ 			seq_printf(m, "%-25s %-20s ",
+ 				   lnames[i].name, "unlimited");
+ 		else
+ 			seq_printf(m, "%-25s %-20lu ",
+ 				   lnames[i].name, rlim[i].rlim_cur);
+ 
+ 		if (rlim[i].rlim_max == RLIM_INFINITY)
+ 			seq_printf(m, "%-20s ", "unlimited");
+ 		else
+ 			seq_printf(m, "%-20lu ", rlim[i].rlim_max);
+ 
+ 		if (lnames[i].unit)
+ 			seq_printf(m, "%-10s\n", lnames[i].unit);
+ 		else
+ 			seq_putc(m, '\n');
+ 	}
+ 
+ 	return 0;
+ }
+ 
+ #ifdef CONFIG_HAVE_ARCH_TRACEHOOK
+ static int proc_pid_syscall(struct seq_file *m, struct pid_namespace *ns,
+ 			    struct pid *pid, struct task_struct *task)
+ {
+ 	struct syscall_info info;
+ 	u64 *args = &info.data.args[0];
+ 	int res;
+ 
+ 	res = lock_trace(task);
+ 	if (res)
+ 		return res;
+ 
+ 	if (task_current_syscall(task, &info))
+ 		seq_puts(m, "running\n");
+ 	else if (info.data.nr < 0)
+ 		seq_printf(m, "%d 0x%llx 0x%llx\n",
+ 			   info.data.nr, info.sp, info.data.instruction_pointer);
+ 	else
+ 		seq_printf(m,
+ 		       "%d 0x%llx 0x%llx 0x%llx 0x%llx 0x%llx 0x%llx 0x%llx 0x%llx\n",
+ 		       info.data.nr,
+ 		       args[0], args[1], args[2], args[3], args[4], args[5],
+ 		       info.sp, info.data.instruction_pointer);
+ 	unlock_trace(task);
+ 
+ 	return 0;
+ }
+ #endif /* CONFIG_HAVE_ARCH_TRACEHOOK */
+ 
+ /************************************************************************/
+ /*                       Here the fs part begins                        */
+ /************************************************************************/
+ 
+ /* permission checks */
+ static int proc_fd_access_allowed(struct inode *inode)
+ {
+ 	struct task_struct *task;
+ 	int allowed = 0;
+ 	/* Allow access to a task's file descriptors if it is us or we
+ 	 * may use ptrace attach to the process and find out that
+ 	 * information.
+ 	 */
+ 	task = get_proc_task(inode);
+ 	if (task) {
+ 		allowed = ptrace_may_access(task, PTRACE_MODE_READ_FSCREDS);
+ 		put_task_struct(task);
+ 	}
+ 	return allowed;
+ }
+ 
+ int proc_setattr(struct dentry *dentry, struct iattr *attr)
+ {
+ 	int error;
+ 	struct inode *inode = d_inode(dentry);
+ 
+ 	if (attr->ia_valid & ATTR_MODE)
+ 		return -EPERM;
+ 
+ 	error = setattr_prepare(dentry, attr);
+ 	if (error)
+ 		return error;
+ 
+ 	setattr_copy(inode, attr);
+ 	mark_inode_dirty(inode);
+ 	return 0;
+ }
+ 
+ /*
+  * May current process learn task's sched/cmdline info (for hide_pid_min=1)
+  * or euid/egid (for hide_pid_min=2)?
+  */
+ static bool has_pid_permissions(struct pid_namespace *pid,
+ 				 struct task_struct *task,
+ 				 int hide_pid_min)
+ {
+ 	if (gr_pid_is_chrooted(task))
+ 		return false;
+ 
+ #if defined(CONFIG_MINISEC_PROC_USER) || defined(CONFIG_MINISEC_PROC_USERGROUP)
+ 	rcu_read_lock();
+ 	{
+ 		const struct cred *tmpcred = current_cred();
+ 		const struct cred *cred = __task_cred(task);
+ 
+ 		if (uid_eq(tmpcred->uid, GLOBAL_ROOT_UID) || uid_eq(tmpcred->uid, cred->uid)
+ #ifdef CONFIG_MINISEC_PROC_USERGROUP
+ 		    || in_group_p(grsec_proc_gid)
+ #endif
+ 		) {
+ 			rcu_read_unlock();
+ 			return true;
+ 		}
+ 	}
+ 	rcu_read_unlock();
+ 	/* Caller is not root, the task's uid or in grsec_proc_gid: with hide_pid set, require ptrace read access. */
+ 	if (pid->hide_pid)
+ 		return ptrace_may_access(task, PTRACE_MODE_READ_FSCREDS | PTRACE_MODE_NOAUDIT);
+ #endif
+ 
+ 	if (pid->hide_pid < hide_pid_min)
+ 		return true;
+ 	if (in_group_p(pid->pid_gid))
+ 		return true;
+ 	return ptrace_may_access(task, PTRACE_MODE_READ_FSCREDS);
+ }
+ 
+ 
+ /* Permission check for a per-pid proc inode; -ESRCH when the task is gone. */
+ static int proc_pid_permission(struct inode *inode, int mask)
+ {
+ 	struct pid_namespace *pid = proc_pid_ns(inode);
+ 	struct task_struct *task;
+ 	bool has_perms;
+ 
+ 	task = get_proc_task(inode);
+ 	if (!task)
+ 		return -ESRCH;
+ 	has_perms = has_pid_permissions(pid, task, HIDEPID_NO_ACCESS);
+ 	put_task_struct(task);
+ 
+ 	if (!has_perms) {
+ #if defined(CONFIG_MINISEC_PROC_USER) || defined(CONFIG_MINISEC_PROC_USERGROUP)
+ 		/* MINISEC: a denied task is always reported as absent. */
+ 		{
+ #else
+ 		if (pid->hide_pid == HIDEPID_INVISIBLE) {
+ #endif
+ 			/*
+ 			 * Let's make getdents(), stat(), and open()
+ 			 * consistent with each other.  If a process
+ 			 * may not stat() a file, it shouldn't be seen
+ 			 * in procfs at all.
+ 			 */
+ 			return -ENOENT;
+ 		}
+ 
+ 		return -EPERM;
+ 	}
+ 	return generic_permission(inode, mask);
+ }
+ 
+ static const struct inode_operations proc_def_inode_operations = {
+ 	.setattr	= proc_setattr,
+ };
+ 
+ static int proc_single_show(struct seq_file *m, void *v)
+ {
+ 	struct inode *inode = m->private;
+ 	struct pid_namespace *ns = proc_pid_ns(inode);
+ 	struct pid *pid = proc_pid(inode);
+ 	struct task_struct *task;
+ 	int ret;
+ 
+ 	task = get_pid_task(pid, PIDTYPE_PID);
+ 	if (!task)
+ 		return -ESRCH;
+ 
+ 	ret = PROC_I(inode)->op.proc_show(m, ns, pid, task);
+ 
+ 	put_task_struct(task);
+ 	return ret;
+ }
+ 
+ static int proc_single_open(struct inode *inode, struct file *filp)
+ {
+ 	return single_open(filp, proc_single_show, inode);
+ }
+ 
+ static const struct file_operations proc_single_file_operations = {
+ 	.open		= proc_single_open,
+ 	.read		= seq_read,
+ 	.llseek		= seq_lseek,
+ 	.release	= single_release,
+ };
+ 
+ 
+ struct mm_struct *proc_mem_open(struct inode *inode, unsigned int mode)
+ {
+ 	struct task_struct *task = get_proc_task(inode);
+ 	struct mm_struct *mm = ERR_PTR(-ESRCH);
+ 
+ 	if (task) {
+ 		mm = mm_access(task, mode | PTRACE_MODE_FSCREDS);
+ 		put_task_struct(task);
+ 
+ 		if (!IS_ERR_OR_NULL(mm)) {
+ 			/* ensure this mm_struct can't be freed */
+ 			mmgrab(mm);
+ 			/* but do not pin its memory */
+ 			mmput(mm);
+ 		}
+ 	}
+ 
+ 	return mm;
+ }
+ 
+ static int __mem_open(struct inode *inode, struct file *file, unsigned int mode)
+ {
+ 	struct mm_struct *mm = proc_mem_open(inode, mode);
+ 
+ 	if (IS_ERR(mm))
+ 		return PTR_ERR(mm);
+ 
+ 	file->private_data = mm;
+ 	return 0;
+ }
+ 
+ static int mem_open(struct inode *inode, struct file *file)
+ {
+ 	int ret = __mem_open(inode, file, PTRACE_MODE_ATTACH);
+ 
+ 	/* OK to pass negative loff_t, we can catch out-of-range */
+ 	file->f_mode |= FMODE_UNSIGNED_OFFSET;
+ 
+ 	return ret;
+ }
+ 
+ static ssize_t mem_rw(struct file *file, char __user *buf,
+ 			size_t count, loff_t *ppos, int write)
+ {	/* common body of mem_read()/mem_write(); @write selects the direction */
+ 	struct mm_struct *mm = file->private_data;
+ 	unsigned long addr = *ppos;	/* file position is the remote address */
+ 	ssize_t copied;
+ 	char *page;
+ 	unsigned int flags;
+ 
+ 	if (!mm)
+ 		return 0;
+ 	/* one page used as bounce buffer between @buf and the remote mm */
+ 	page = (char *)__get_free_page(GFP_KERNEL);
+ 	if (!page)
+ 		return -ENOMEM;
+ 
+ 	copied = 0;
+ 	if (!mmget_not_zero(mm))	/* no user reference obtainable: return 0 */
+ 		goto free;
+ 
+ 	flags = FOLL_FORCE | (write ? FOLL_WRITE : 0);
+ 	/* transfer at most one page per iteration */
+ 	while (count > 0) {
+ 		int this_len = min_t(int, count, PAGE_SIZE);
+ 
+ 		if (write && copy_from_user(page, buf, this_len)) {
+ 			copied = -EFAULT;
+ 			break;
+ 		}
+ 
+ 		this_len = access_remote_vm(mm, addr, page, this_len, flags);
+ 		if (!this_len) {
+ 			if (!copied)	/* -EIO only if nothing was transferred yet */
+ 				copied = -EIO;
+ 			break;
+ 		}
+ 
+ 		if (!write && copy_to_user(buf, page, this_len)) {
+ 			copied = -EFAULT;
+ 			break;
+ 		}
+ 
+ 		buf += this_len;
+ 		addr += this_len;
+ 		copied += this_len;
+ 		count -= this_len;
+ 	}
+ 	*ppos = addr;
+ 
+ 	mmput(mm);
+ free:
+ 	free_page((unsigned long) page);
+ 	return copied;
+ }
+ 
+ static ssize_t mem_read(struct file *file, char __user *buf,
+ 			size_t count, loff_t *ppos)
+ {	/* ->read handler: mem_rw() with write == 0 */
+ 	return mem_rw(file, buf, count, ppos, 0);
+ }
+ 
+ static ssize_t mem_write(struct file *file, const char __user *buf,
+ 			 size_t count, loff_t *ppos)
+ {	/* ->write handler: mem_rw() with write == 1; its buf parameter is non-const */
+ 	return mem_rw(file, (char __user*)buf, count, ppos, 1);
+ }
+ 
+ /*
+  * llseek for address-indexed files: only absolute (whence 0) and relative
+  * (whence 1) seeks are supported, anything else is -EINVAL.
+  */
+ loff_t mem_lseek(struct file *file, loff_t offset, int orig)
+ {
+ 	if (orig != 0 && orig != 1)
+ 		return -EINVAL;
+ 	if (orig == 0)
+ 		file->f_pos = offset;
+ 	else
+ 		file->f_pos += offset;
+ 	force_successful_syscall_return();	/* f_pos may look negative */
+ 	return file->f_pos;
+ }
+ 
+ /* Drop the mm_struct reference stored in private_data at open time, if any. */
+ static int mem_release(struct inode *inode, struct file *file)
+ {
+ 	if (file->private_data)
+ 		mmdrop(file->private_data);
+ 	return 0;
+ }
+ 
+ static const struct file_operations proc_mem_operations = {
+ 	.open		= mem_open,	/* stores the target mm in private_data */
+ 	.release	= mem_release,	/* drops that mm reference again */
+ 	.read		= mem_read,
+ 	.write		= mem_write,
+ 	.llseek		= mem_lseek,	/* absolute and relative seeks only */
+ };
+ 
+ static int environ_open(struct inode *inode, struct file *file)
+ {	/* asks for PTRACE_MODE_READ access, where mem_open() asks for ATTACH */
+ 	return __mem_open(inode, file, PTRACE_MODE_READ);
+ }
+ 
+ static ssize_t environ_read(struct file *file, char __user *buf,
+ 			size_t count, loff_t *ppos)
+ {	/* *ppos is a byte offset into the target's [env_start, env_end) area */
+ 	char *page;
+ 	unsigned long src = *ppos;
+ 	int ret = 0;
+ 	struct mm_struct *mm = file->private_data;
+ 	unsigned long env_start, env_end;
+ 
+ 	/* Ensure the process spawned far enough to have an environment. */
+ 	if (!mm || !mm->env_end)
+ 		return 0;
+ 
+ 	page = (char *)__get_free_page(GFP_KERNEL);
+ 	if (!page)
+ 		return -ENOMEM;
+ 
+ 	ret = 0;
+ 	if (!mmget_not_zero(mm))
+ 		goto free;
+ 	/* snapshot both bounds under arg_lock so they are read together */
+ 	spin_lock(&mm->arg_lock);
+ 	env_start = mm->env_start;
+ 	env_end = mm->env_end;
+ 	spin_unlock(&mm->arg_lock);
+ 	/* copy out page-sized pieces until @count or the environment runs out */
+ 	while (count > 0) {
+ 		size_t this_len, max_len;
+ 		int retval;
+ 
+ 		if (src >= (env_end - env_start))
+ 			break;
+ 
+ 		this_len = env_end - (env_start + src);
+ 
+ 		max_len = min_t(size_t, PAGE_SIZE, count);
+ 		this_len = min(max_len, this_len);
+ 
+ 		retval = access_remote_vm(mm, (env_start + src), page, this_len, FOLL_ANON);
+ 
+ 		if (retval <= 0) {	/* note: replaces any byte count gathered so far */
+ 			ret = retval;
+ 			break;
+ 		}
+ 
+ 		if (copy_to_user(buf, page, retval)) {
+ 			ret = -EFAULT;
+ 			break;
+ 		}
+ 
+ 		ret += retval;
+ 		src += retval;
+ 		buf += retval;
+ 		count -= retval;
+ 	}
+ 	*ppos = src;
+ 	mmput(mm);
+ 
+ free:
+ 	free_page((unsigned long) page);
+ 	return ret;
+ }
+ 
+ static const struct file_operations proc_environ_operations = {
+ 	.llseek		= generic_file_llseek,
+ 	.read		= environ_read,
+ 	.open		= environ_open,	/* stores the target mm in private_data */
+ 	.release	= mem_release,	/* shared with the other mm-backed files */
+ };
+ 
+ static int auxv_open(struct inode *inode, struct file *file)
+ {	/* stash the read-access-checked mm in file->private_data */
+ 	return __mem_open(inode, file, PTRACE_MODE_READ_FSCREDS);
+ }
+ 
+ /* Copy out the saved auxiliary vector, up to and including its AT_NULL pair. */
+ static ssize_t auxv_read(struct file *file, char __user *buf,
+ 			size_t count, loff_t *ppos)
+ {
+ 	struct mm_struct *mm = file->private_data;
+ 	unsigned int words = 2;
+ 
+ 	if (!mm)
+ 		return 0;
+ 	while (mm->saved_auxv[words - 2] != 0)	/* not yet AT_NULL */
+ 		words += 2;
+ 	return simple_read_from_buffer(buf, count, ppos, mm->saved_auxv,
+ 				       words * sizeof(mm->saved_auxv[0]));
+ }
+ 
+ static const struct file_operations proc_auxv_operations = {
+ 	.llseek		= generic_file_llseek,
+ 	.read		= auxv_read,
+ 	.open		= auxv_open,	/* stores the target mm in private_data */
+ 	.release	= mem_release,	/* shared with the other mm-backed files */
+ };
+ 
+ /* Report oom_score_adj scaled back onto the legacy oom_adj range. */
+ static ssize_t oom_adj_read(struct file *file, char __user *buf, size_t count,
+ 			    loff_t *ppos)
+ {
+ 	struct task_struct *tsk = get_proc_task(file_inode(file));
+ 	char text[PROC_NUMBUF];
+ 	size_t n;
+ 	int adj;
+ 
+ 	if (!tsk)
+ 		return -ESRCH;
+ 	/* The maximum maps exactly; everything else is scaled linearly. */
+ 	adj = (tsk->signal->oom_score_adj == OOM_SCORE_ADJ_MAX) ? OOM_ADJUST_MAX :
+ 	      (tsk->signal->oom_score_adj * -OOM_DISABLE) / OOM_SCORE_ADJ_MAX;
+ 	put_task_struct(tsk);
+ 
+ 	n = snprintf(text, sizeof(text), "%d\n", adj);
+ 	return simple_read_from_buffer(buf, count, ppos, text, n);
+ }
+ 
+ static int __set_oom_adj(struct file *file, int oom_adj, bool legacy)
+ {	/* @oom_adj is already on the oom_score_adj scale; @legacy marks the oom_adj file */
+ 	struct mm_struct *mm = NULL;
+ 	struct task_struct *task;
+ 	int err = 0;
+ 
+ 	task = get_proc_task(file_inode(file));
+ 	if (!task)
+ 		return -ESRCH;
+ 	/* oom_adj_mutex is held until err_unlock, i.e. across the whole update */
+ 	mutex_lock(&oom_adj_mutex);
+ 	if (legacy) {
+ 		if (oom_adj < task->signal->oom_score_adj &&
+ 				!capable(CAP_SYS_RESOURCE)) {
+ 			err = -EACCES;
+ 			goto err_unlock;
+ 		}
+ 		/*
+ 		 * /proc/pid/oom_adj is provided for legacy purposes, ask users to use
+ 		 * /proc/pid/oom_score_adj instead.
+ 		 */
+ 		pr_warn_once("%s (%d): /proc/%d/oom_adj is deprecated, please use /proc/%d/oom_score_adj instead.\n",
+ 			  current->comm, task_pid_nr(current), task_pid_nr(task),
+ 			  task_pid_nr(task));
+ 	} else {
+ 		if ((short)oom_adj < task->signal->oom_score_adj_min &&
+ 				!capable(CAP_SYS_RESOURCE)) {
+ 			err = -EACCES;
+ 			goto err_unlock;
+ 		}
+ 	}
+ 
+ 	/*
+ 	 * Make sure we will check other processes sharing the mm if this is
+ 	 * not vfork which wants its own oom_score_adj.
+ 	 * pin the mm so it doesn't go away and get reused after task_unlock
+ 	 */
+ 	if (!task->vfork_done) {
+ 		struct task_struct *p = find_lock_task_mm(task);
+ 
+ 		if (p) {
+ 			if (test_bit(MMF_MULTIPROCESS, &p->mm->flags)) {
+ 				mm = p->mm;
+ 				mmgrab(mm);
+ 			}
+ 			task_unlock(p);
+ 		}
+ 	}
+ 	/* update the task itself; only the non-legacy file may move oom_score_adj_min */
+ 	task->signal->oom_score_adj = oom_adj;
+ 	if (!legacy && has_capability_noaudit(current, CAP_SYS_RESOURCE))
+ 		task->signal->oom_score_adj_min = (short)oom_adj;
+ 	trace_oom_score_adj_update(task);
+ 	/* mm is non-NULL only when it was flagged MMF_MULTIPROCESS above */
+ 	if (mm) {
+ 		struct task_struct *p;
+ 
+ 		rcu_read_lock();
+ 		for_each_process(p) {
+ 			if (same_thread_group(task, p))
+ 				continue;
+ 
+ 			/* do not touch kernel threads or the global init */
+ 			if (p->flags & PF_KTHREAD || is_global_init(p))
+ 				continue;
+ 
+ 			task_lock(p);
+ 			if (!p->vfork_done && process_shares_mm(p, mm)) {
+ 				p->signal->oom_score_adj = oom_adj;
+ 				if (!legacy && has_capability_noaudit(current, CAP_SYS_RESOURCE))
+ 					p->signal->oom_score_adj_min = (short)oom_adj;
+ 			}
+ 			task_unlock(p);
+ 		}
+ 		rcu_read_unlock();
+ 		mmdrop(mm);	/* pairs with the mmgrab() above */
+ 	}
+ err_unlock:
+ 	mutex_unlock(&oom_adj_mutex);
+ 	put_task_struct(task);
+ 	return err;
+ }
+ 
+ /*
+  * /proc/pid/oom_adj exists solely for backwards compatibility with previous
+  * kernels.  The effective policy is defined by oom_score_adj, which has a
+  * different scale: oom_adj grew exponentially and oom_score_adj grows linearly.
+  * Values written to oom_adj are simply mapped linearly to oom_score_adj.
+  * Processes that become oom disabled via oom_adj will still be oom disabled
+  * with this implementation.
+  *
+  * oom_adj cannot be removed since existing userspace binaries use it.
+  */
+ static ssize_t oom_adj_write(struct file *file, const char __user *buf,
+ 			     size_t count, loff_t *ppos)
+ {
+ 	char text[PROC_NUMBUF];
+ 	int adj;
+ 	int rc;
+ 
+ 	/* Over-long input is truncated; one byte stays reserved for the NUL. */
+ 	memset(text, 0, sizeof(text));
+ 	if (count >= sizeof(text))
+ 		count = sizeof(text) - 1;
+ 	if (copy_from_user(text, buf, count))
+ 		return -EFAULT;
+ 
+ 	/* strstrip() trims surrounding whitespace before the number is parsed. */
+ 	rc = kstrtoint(strstrip(text), 0, &adj);
+ 	if (rc)
+ 		goto done;
+ 
+ 	/* OOM_DISABLE is always accepted, whatever the range check says. */
+ 	if (adj != OOM_DISABLE && (adj < OOM_ADJUST_MIN || adj > OOM_ADJUST_MAX))
+ 		return -EINVAL;
+ 
+ 	/*
+ 	 * Convert to the oom_score_adj scale.  OOM_ADJUST_MAX is mapped
+ 	 * explicitly rather than scaled so that the maximum score stays
+ 	 * attainable.
+ 	 */
+ 	if (adj != OOM_ADJUST_MAX)
+ 		adj = (adj * OOM_SCORE_ADJ_MAX) / -OOM_DISABLE;
+ 	else
+ 		adj = OOM_SCORE_ADJ_MAX;
+ 
+ 	rc = __set_oom_adj(file, adj, true);
+ done:
+ 	return rc < 0 ? rc : count;
+ }
+ 
+ static const struct file_operations proc_oom_adj_operations = {
+ 	.llseek		= generic_file_llseek,
+ 	.write		= oom_adj_write,	/* legacy scale, converted on write */
+ 	.read		= oom_adj_read,		/* converted back on read */
+ };
+ 
+ /* Format the task's current oom_score_adj as a decimal line for user space. */
+ static ssize_t oom_score_adj_read(struct file *file, char __user *buf,
+ 					size_t count, loff_t *ppos)
+ {
+ 	struct task_struct *tsk = get_proc_task(file_inode(file));
+ 	char text[PROC_NUMBUF];
+ 	short score;
+ 
+ 	if (!tsk)
+ 		return -ESRCH;
+ 	score = tsk->signal->oom_score_adj;
+ 	put_task_struct(tsk);
+ 	return simple_read_from_buffer(buf, count, ppos, text,
+ 			snprintf(text, sizeof(text), "%hd\n", score));
+ }
+ 
+ static ssize_t oom_score_adj_write(struct file *file, const char __user *buf,
+ 					size_t count, loff_t *ppos)
+ {
+ 	char text[PROC_NUMBUF];
+ 	int score;
+ 	int rc;
+ 
+ 	/* Over-long input is truncated; one byte stays reserved for the NUL. */
+ 	memset(text, 0, sizeof(text));
+ 	if (count >= sizeof(text))
+ 		count = sizeof(text) - 1;
+ 	if (copy_from_user(text, buf, count))
+ 		return -EFAULT;
+ 
+ 	rc = kstrtoint(strstrip(text), 0, &score);
+ 	if (rc)
+ 		goto done;
+ 
+ 	/* Only values within [OOM_SCORE_ADJ_MIN, OOM_SCORE_ADJ_MAX] pass. */
+ 	if (score < OOM_SCORE_ADJ_MIN ||
+ 	    score > OOM_SCORE_ADJ_MAX)
+ 		return -EINVAL;
+ 
+ 	/* legacy == false: the value is used as-is, without rescaling. */
+ 	rc = __set_oom_adj(file, score, false);
+ done:
+ 	return rc < 0 ? rc : count;
+ }
+ 
+ static const struct file_operations proc_oom_score_adj_operations = {
+ 	.llseek		= default_llseek,
+ 	.write		= oom_score_adj_write,	/* range-checked, non-legacy path */
+ 	.read		= oom_score_adj_read,
+ };
+ 
+ #ifdef CONFIG_AUDIT
+ #define TMPBUFLEN 11
+ /* Print the task's audit login uid, mapped into the file's credential userns. */
+ static ssize_t proc_loginuid_read(struct file *file, char __user *buf,
+ 				  size_t count, loff_t *ppos)
+ {
+ 	struct task_struct *tsk = get_proc_task(file_inode(file));
+ 	char out[TMPBUFLEN];
+ 	ssize_t n;
+ 	uid_t uid;
+ 
+ 	if (!tsk)
+ 		return -ESRCH;
+ 	uid = from_kuid(file->f_cred->user_ns, audit_get_loginuid(tsk));
+ 	put_task_struct(tsk);
+ 	n = scnprintf(out, sizeof(out), "%u", uid);
+ 	return simple_read_from_buffer(buf, count, ppos, out, n);
+ }
+ 
+ /*
+  * Set the audit login uid via audit_set_loginuid().  Only the task that the
+  * proc entry refers to may write here, and only starting at offset 0.
+  */
+ static ssize_t proc_loginuid_write(struct file *file, const char __user *buf,
+ 				   size_t count, loff_t *ppos)
+ {
+ 	struct inode *inode = file_inode(file);
+ 	bool is_self;
+ 	kuid_t kuid;
+ 	uid_t uid;
+ 	int err;
+ 
+ 	rcu_read_lock();
+ 	is_self = (current == pid_task(proc_pid(inode), PIDTYPE_PID));
+ 	rcu_read_unlock();
+ 	if (!is_self)
+ 		return -EPERM;
+ 
+ 	/* No partial writes. */
+ 	if (*ppos != 0)
+ 		return -EINVAL;
+ 
+ 	err = kstrtou32_from_user(buf, count, 10, &uid);
+ 	if (err < 0)
+ 		return err;
+ 
+ 	/* AUDIT_UID_UNSET is an explicit request to unset the loginuid. */
+ 	kuid = INVALID_UID;
+ 	if (uid != AUDIT_UID_UNSET) {
+ 		kuid = make_kuid(file->f_cred->user_ns, uid);
+ 		if (!uid_valid(kuid))
+ 			return -EINVAL;
+ 	}
+ 
+ 	err = audit_set_loginuid(kuid);
+ 	return err < 0 ? err : count;
+ }
+ 
+ static const struct file_operations proc_loginuid_operations = {
+ 	.llseek		= generic_file_llseek,
+ 	.write		= proc_loginuid_write,	/* only the task itself may write */
+ 	.read		= proc_loginuid_read,
+ };
+ 
+ /* Print the task's audit session id as an unsigned decimal. */
+ static ssize_t proc_sessionid_read(struct file *file, char __user *buf,
+ 				  size_t count, loff_t *ppos)
+ {
+ 	struct task_struct *tsk = get_proc_task(file_inode(file));
+ 	char out[TMPBUFLEN];
+ 	ssize_t n;
+ 
+ 	if (!tsk)
+ 		return -ESRCH;
+ 	n = scnprintf(out, sizeof(out), "%u", audit_get_sessionid(tsk));
+ 	put_task_struct(tsk);
+ 
+ 	return simple_read_from_buffer(buf, count, ppos, out, n);
+ }
+ 
+ static const struct file_operations proc_sessionid_operations = {
+ 	.llseek		= generic_file_llseek,
+ 	.read		= proc_sessionid_read,	/* read-only: no .write handler */
+ };
+ #endif
+ 
+ #ifdef CONFIG_FAULT_INJECTION
+ /* Show the task's make_it_fail value as a decimal line. */
+ static ssize_t proc_fault_inject_read(struct file *file, char __user *buf,
+ 				      size_t count, loff_t *ppos)
+ {
+ 	struct task_struct *tsk = get_proc_task(file_inode(file));
+ 	char text[PROC_NUMBUF];
+ 	int flag;
+ 
+ 	if (!tsk)
+ 		return -ESRCH;
+ 	/* Sample the flag, then let go of the task before formatting. */
+ 	flag = tsk->make_it_fail;
+ 	put_task_struct(tsk);
+ 
+ 	return simple_read_from_buffer(buf, count, ppos, text,
+ 			snprintf(text, sizeof(text), "%i\n", flag));
+ }
+ 
+ /* Parse an integer restricted to 0 or 1 and store it in task->make_it_fail. */
+ static ssize_t proc_fault_inject_write(struct file *file,
+ 			const char __user *buf, size_t count, loff_t *ppos)
+ {
+ 	struct task_struct *tsk;
+ 	char text[PROC_NUMBUF];
+ 	int flag, err;
+ 
+ 	if (!capable(CAP_SYS_RESOURCE))
+ 		return -EPERM;
+ 	/* Over-long input is truncated; one byte stays reserved for the NUL. */
+ 	memset(text, 0, sizeof(text));
+ 	if (count >= sizeof(text))
+ 		count = sizeof(text) - 1;
+ 	if (copy_from_user(text, buf, count))
+ 		return -EFAULT;
+ 	err = kstrtoint(strstrip(text), 0, &flag);
+ 	if (err < 0)
+ 		return err;
+ 	if (flag != 0 && flag != 1)
+ 		return -EINVAL;
+ 
+ 	tsk = get_proc_task(file_inode(file));
+ 	if (!tsk)
+ 		return -ESRCH;
+ 	tsk->make_it_fail = flag;
+ 	put_task_struct(tsk);
+ 	return count;
+ }
+ 
+ static const struct file_operations proc_fault_inject_operations = {
+ 	.llseek		= generic_file_llseek,
+ 	.write		= proc_fault_inject_write,	/* needs CAP_SYS_RESOURCE */
+ 	.read		= proc_fault_inject_read,
+ };
+ 
+ /* Parse an unsigned integer from user space into the task's fail_nth field. */
+ static ssize_t proc_fail_nth_write(struct file *file, const char __user *buf,
+ 				   size_t count, loff_t *ppos)
+ {
+ 	struct task_struct *tsk;
+ 	unsigned int nth;
+ 	int rc = kstrtouint_from_user(buf, count, 0, &nth);
+ 
+ 	if (rc)
+ 		return rc;
+ 
+ 	/* Parsing comes first, so bad input never takes a task reference. */
+ 	tsk = get_proc_task(file_inode(file));
+ 	if (!tsk)
+ 		return -ESRCH;
+ 	tsk->fail_nth = nth;
+ 	put_task_struct(tsk);
+ 	return count;
+ }
+ 
+ /* Show the task's fail_nth value as an unsigned decimal line. */
+ static ssize_t proc_fail_nth_read(struct file *file, char __user *buf,
+ 				  size_t count, loff_t *ppos)
+ {
+ 	struct task_struct *tsk = get_proc_task(file_inode(file));
+ 	char text[PROC_NUMBUF];
+ 	ssize_t n;
+ 
+ 	if (!tsk)
+ 		return -ESRCH;
+ 	n = snprintf(text, sizeof(text), "%u\n", tsk->fail_nth);
+ 	put_task_struct(tsk);
+ 	return simple_read_from_buffer(buf, count, ppos, text, n);
+ }
+ 
+ static const struct file_operations proc_fail_nth_operations = {
+ 	.write		= proc_fail_nth_write,
+ 	.read		= proc_fail_nth_read,	/* no .llseek handler is set here */
+ };
+ #endif
+ 
+ 
+ #ifdef CONFIG_SCHED_DEBUG
+ /*
+  * Print out various scheduling related per-task fields:
+  */
+ static int sched_show(struct seq_file *m, void *v)
+ {
+ 	struct inode *inode = m->private;
+ 	struct task_struct *tsk = get_proc_task(inode);
+ 
+ 	/* No task could be obtained for this inode. */
+ 	if (!tsk)
+ 		return -ESRCH;
+ 
+ 	/* Emit the fields, passing along the pid namespace of this proc inode. */
+ 	proc_sched_show_task(tsk, proc_pid_ns(inode), m);
+ 	put_task_struct(tsk);
+ 
+ 	return 0;
+ }
+ 
+ /*
+  * Any write triggers proc_sched_set_task() on the target; the written bytes
+  * themselves are never looked at and the whole write is reported as consumed.
+  */
+ static ssize_t
+ sched_write(struct file *file, const char __user *buf,
+ 	    size_t count, loff_t *offset)
+ {
+ 	struct task_struct *tsk = get_proc_task(file_inode(file));
+ 
+ 	if (!tsk)
+ 		return -ESRCH;
+ 	proc_sched_set_task(tsk);
+ 	put_task_struct(tsk);
+ 	return count;
+ }
+ 
+ static int sched_open(struct inode *inode, struct file *filp)
+ {	/* one-record seq_file: sched_show() gets the inode as its data */
+ 	return single_open(filp, sched_show, inode);
+ }
+ 
+ static const struct file_operations proc_pid_sched_operations = {
+ 	.read		= seq_read,		/* output comes from sched_show() */
+ 	.llseek		= seq_lseek,
+ 	.write		= sched_write,		/* payload is ignored */
+ 	.open		= sched_open,
+ 	.release	= single_release,	/* counterpart of single_open() */
+ };
+ 
+ #endif
+ 
+ #ifdef CONFIG_SCHED_AUTOGROUP
+ /*
+  * Print out autogroup related information:
+  */
+ static int sched_autogroup_show(struct seq_file *m, void *v)
+ {
+ 	/* m->private carries the proc inode (set up by the open handler). */
+ 	struct task_struct *tsk = get_proc_task(m->private);
+ 
+ 	if (!tsk)
+ 		return -ESRCH;
+ 
+ 	/* Formatting is delegated; the task reference is only held across it. */
+ 	proc_sched_autogroup_show_task(tsk, m);
+ 	put_task_struct(tsk);
+ 
+ 	return 0;
+ }
+ 
+ /* Parse an integer and hand it to proc_sched_autogroup_set_nice(). */
+ static ssize_t
+ sched_autogroup_write(struct file *file, const char __user *buf,
+ 	    size_t count, loff_t *offset)
+ {
+ 	struct task_struct *tsk;
+ 	char text[PROC_NUMBUF];
+ 	int nice;
+ 	int rc;
+ 
+ 	/* Over-long input is truncated; one byte stays reserved for the NUL. */
+ 	memset(text, 0, sizeof(text));
+ 	if (count >= sizeof(text))
+ 		count = sizeof(text) - 1;
+ 	if (copy_from_user(text, buf, count))
+ 		return -EFAULT;
+ 
+ 	rc = kstrtoint(strstrip(text), 0, &nice);
+ 	if (rc < 0)
+ 		return rc;
+ 
+ 	tsk = get_proc_task(file_inode(file));
+ 	if (!tsk)
+ 		return -ESRCH;
+ 
+ 	/* Any non-zero result from the setter replaces the byte count. */
+ 	rc = proc_sched_autogroup_set_nice(tsk, nice);
+ 	put_task_struct(tsk);
+ 	if (rc)
+ 		return rc;
+ 	return count;
+ }
+ 
+ /* Open as a single-record seq_file and attach the inode for the show routine. */
+ static int sched_autogroup_open(struct inode *inode, struct file *filp)
+ {
+ 	struct seq_file *seq;
+ 	int err = single_open(filp, sched_autogroup_show, NULL);
+ 
+ 	if (err)
+ 		return err;
+ 	seq = filp->private_data;	/* the seq_file now attached to filp */
+ 	seq->private = inode;
+ 	return 0;
+ }
+ 
+ static const struct file_operations proc_pid_sched_autogroup_operations = {
+ 	.read		= seq_read,		/* output: sched_autogroup_show() */
+ 	.llseek		= seq_lseek,
+ 	.write		= sched_autogroup_write,	/* takes a nice value */
+ 	.open		= sched_autogroup_open,
+ 	.release	= single_release,	/* counterpart of single_open() */
+ };
+ 
+ #endif /* CONFIG_SCHED_AUTOGROUP */
+ 
+ /* Set the comm of a task that belongs to the caller's own thread group. */
+ static ssize_t comm_write(struct file *file, const char __user *buf,
+ 				size_t count, loff_t *offset)
+ {
+ 	char name[TASK_COMM_LEN] = { 0 };
+ 	size_t copy_len = sizeof(name) - 1;
+ 	struct task_struct *tsk;
+ 	ssize_t ret = -EINVAL;
+ 
+ 	/* Input past TASK_COMM_LEN - 1 bytes is dropped rather than rejected. */
+ 	if (count < copy_len)
+ 		copy_len = count;
+ 	if (copy_from_user(name, buf, copy_len))
+ 		return -EFAULT;
+ 
+ 	tsk = get_proc_task(file_inode(file));
+ 	if (!tsk)
+ 		return -ESRCH;
+ 	if (same_thread_group(current, tsk)) {
+ 		set_task_comm(tsk, name);
+ 		ret = count;
+ 	}
+ 	put_task_struct(tsk);
+ 	return ret;
+ }
+ 
+ /* seq_file show routine: print the task's name followed by a newline. */
+ static int comm_show(struct seq_file *m, void *v)
+ {
+ 	struct task_struct *tsk = get_proc_task(m->private);
+ 
+ 	if (!tsk)
+ 		return -ESRCH;
+ 
+ 	/* proc_task_name() writes the name into the seq_file. */
+ 	proc_task_name(m, tsk, false);
+ 	put_task_struct(tsk);
+ 
+ 	/* The terminating newline does not need the task any more. */
+ 	seq_putc(m, '\n');
+ 	return 0;
+ }
+ 
+ static int comm_open(struct inode *inode, struct file *filp)
+ {	/* one-record seq_file: comm_show() gets the inode as its data */
+ 	return single_open(filp, comm_show, inode);
+ }
+ 
+ static const struct file_operations proc_pid_set_comm_operations = {
+ 	.read		= seq_read,		/* output comes from comm_show() */
+ 	.llseek		= seq_lseek,
+ 	.write		= comm_write,		/* same thread group only */
+ 	.open		= comm_open,
+ 	.release	= single_release,	/* counterpart of single_open() */
+ };
+ 
+ /* Fill @exe_path with a referenced path to the task's exe file, or -ENOENT. */
+ static int proc_exe_link(struct dentry *dentry, struct path *exe_path)
+ {
+ 	struct task_struct *tsk = get_proc_task(d_inode(dentry));
+ 	struct file *exe = NULL;
+ 
+ 	if (tsk) {
+ 		exe = get_task_exe_file(tsk);
+ 		put_task_struct(tsk);
+ 	}
+ 	if (!exe)
+ 		return -ENOENT;
+ 
+ 	*exe_path = exe->f_path;
+ 	path_get(&exe->f_path);	/* reference handed to the caller */
+ 	fput(exe);
+ 	return 0;
+ }
+ 
+ static const char *proc_pid_get_link(struct dentry *dentry,
+ 				     struct inode *inode,
+ 				     struct delayed_call *done)
+ {
+ 	struct path target;
+ 	int err;
+ 
+ 	/* No dentry was supplied: report -ECHILD instead of resolving. */
+ 	if (!dentry)
+ 		return ERR_PTR(-ECHILD);
+ 
+ 	/* Are we allowed to snoop on the task's file descriptors? */
+ 	if (!proc_fd_access_allowed(inode))
+ 		return ERR_PTR(-EACCES);
+ 
+ 	/* Let the per-entry callback produce the path to jump to. */
+ 	err = PROC_I(inode)->op.proc_get_link(dentry, &target);
+ 	if (err)
+ 		return ERR_PTR(err);
+ 
+ 	nd_jump_link(&target);
+ 	return NULL;
+ }
+ 
+ /* Copy @path's text (at most @buflen bytes) to user space; length or -errno. */
+ static int do_proc_readlink(struct path *path, char __user *buffer, int buflen)
+ {
+ 	char *page = (char *)__get_free_page(GFP_KERNEL);
+ 	char *name;
+ 	int ret;
+ 
+ 	if (!page)
+ 		return -ENOMEM;
+ 
+ 	name = d_path(path, page, PAGE_SIZE);
+ 	if (IS_ERR(name)) {
+ 		ret = PTR_ERR(name);
+ 	} else {
+ 		ret = page + PAGE_SIZE - 1 - name;
+ 		if (ret > buflen)
+ 			ret = buflen;
+ 		if (copy_to_user(buffer, name, ret))
+ 			ret = -EFAULT;
+ 	}
+ 	free_page((unsigned long)page);
+ 	return ret;
+ }
+ 
+ static int proc_pid_readlink(struct dentry *dentry, char __user *buffer, int buflen)
+ {
+ 	struct inode *inode = d_inode(dentry);
+ 	struct path target;
+ 	int ret;
+ 
+ 	/* Are we allowed to snoop on the task's file descriptors? */
+ 	if (!proc_fd_access_allowed(inode))
+ 		return -EACCES;
+ 
+ 	/* Resolve through the per-entry callback stored in the proc inode. */
+ 	ret = PROC_I(inode)->op.proc_get_link(dentry, &target);
+ 	if (ret)
+ 		return ret;
+ 
+ 	ret = do_proc_readlink(&target, buffer, buflen);
+ 	path_put(&target);	/* done with the path the callback filled in */
+ 	return ret;
+ }
+ 
+ const struct inode_operations proc_pid_link_inode_operations = {
+ 	.get_link	= proc_pid_get_link,	/* jumps to the target via nd_jump_link() */
+ 	.readlink	= proc_pid_readlink,	/* copies the path text to user space */
+ 	.setattr	= proc_setattr,
+ };
+ 
+ 
+ /* building an inode */
+ 
+ void task_dump_owner(struct task_struct *task, umode_t mode,
+ 		     kuid_t *ruid, kgid_t *rgid)
+ {
+ 	/* Work out who should own a proc file of @mode for @task, taking the
+ 	 * task's dumpable state into account; result goes to @ruid / @rgid.
+ 	 */
+ 	const struct cred *cred;
+ 	kuid_t uid;
+ 	kgid_t gid;
+ 
+ 	if (unlikely(task->flags & PF_KTHREAD)) {	/* kernel threads: always root */
+ 		*ruid = GLOBAL_ROOT_UID;
+ 		*rgid = GLOBAL_ROOT_GID;
+ 		return;
+ 	}
+ 
+ 	/* Default to the task's effective ownership */
+ 	rcu_read_lock();
+ 	cred = __task_cred(task);
+ 	uid = cred->euid;
+ #ifdef CONFIG_MINISEC_PROC_USERGROUP
+ 	gid = grsec_proc_gid;	/* MINISEC: grsec_proc_gid instead of the task's egid */
+ #else
+ 	gid = cred->egid;
+ #endif
+ 	rcu_read_unlock();
+ 
+ 	/*
+ 	 * Before the /proc/pid/status file was created the only way to read
+ 	 * the effective uid of a process was to stat /proc/pid.  Reading
+ 	 * /proc/pid/status is slow enough that procps and other packages
+ 	 * kept stating /proc/pid.  To keep the rules in /proc simple I have
+ 	 * made this apply to all per process world readable and executable
+ 	 * directories.
+ 	 */
+ 	if (mode != (S_IFDIR|S_IRUGO|S_IXUGO)) {
+ 		struct mm_struct *mm;
+ 		task_lock(task);
+ 		mm = task->mm;
+ 		/* Make non-dumpable tasks owned by root of the mm's user namespace */
+ 		if (mm) {
+ 			if (get_dumpable(mm) != SUID_DUMP_USER) {
+ 				struct user_namespace *user_ns = mm->user_ns;
+ 
+ 				uid = make_kuid(user_ns, 0);
+ 				if (!uid_valid(uid))	/* id 0 not mapped there: global root */
+ 					uid = GLOBAL_ROOT_UID;
+ 
+ 				gid = make_kgid(user_ns, 0);
+ 				if (!gid_valid(gid))
+ 					gid = GLOBAL_ROOT_GID;
+ 			}
+ 		} else {	/* task without an mm: global root */
+ 			uid = GLOBAL_ROOT_UID;
+ 			gid = GLOBAL_ROOT_GID;
+ 		}
+ 		task_unlock(task);
+ 	}
+ 	*ruid = uid;
+ 	*rgid = gid;
+ }
+ 
+ /*
+  * Allocate and initialise a proc inode for @task on @sb with mode @mode.
+  * The inode pins the task's struct pid (PIDTYPE_PID), gets its uid/gid from
+  * task_dump_owner() and is passed to security_task_to_inode().
+  * Returns the new inode, or NULL if allocation or the pid lookup fails.
+  */
+ struct inode *proc_pid_make_inode(struct super_block *sb,
+ 				  struct task_struct *task, umode_t mode)
+ {
+ 	struct inode *inode;
+ 	struct proc_inode *ei;
+ 
+ 	inode = new_inode(sb);
+ 	if (!inode)
+ 		return NULL;
+ 
+ 	/* Fields every inode created here shares. */
+ 	inode->i_mode = mode;
+ 	inode->i_ino = get_next_ino();
+ 	inode->i_mtime = inode->i_atime = inode->i_ctime = current_time(inode);
+ 	inode->i_op = &proc_def_inode_operations;
+ 
+ 	/* Take the reference on the task's struct pid; give up without it. */
+ 	ei = PROC_I(inode);
+ 	ei->pid = get_task_pid(task, PIDTYPE_PID);
+ 	if (!ei->pid) {
+ 		iput(inode);
+ 		return NULL;
+ 	}
+ 
+ 	/* Mode 0 never matches the world-readable directory exception. */
+ 	task_dump_owner(task, 0, &inode->i_uid, &inode->i_gid);
+ 	security_task_to_inode(task, inode);
+ 
+ 	return inode;
+ }
+ 
+ /* getattr for per-task proc inodes: generic attributes plus live ownership. */
+ int pid_getattr(const struct path *path, struct kstat *stat,
+ 		u32 request_mask, unsigned int query_flags)
+ {
+ 	struct inode *inode = d_inode(path->dentry);
+ 	struct pid_namespace *ns = proc_pid_ns(inode);
+ 	struct task_struct *tsk;
+ 	int ret = 0;
+ 
+ 	generic_fillattr(inode, stat);
+ 
+ 	/* Root ownership is what gets reported when no task is found. */
+ 	stat->uid = GLOBAL_ROOT_UID;
+ 	stat->gid = GLOBAL_ROOT_GID;
+ 	rcu_read_lock();
+ 	tsk = pid_task(proc_pid(inode), PIDTYPE_PID);
+ 	/*
+ 	 * Failing with -ENOENT doesn't prevent learning whether the PID
+ 	 * exists, it only makes getattr() consistent with readdir().
+ 	 */
+ 	if (tsk && !has_pid_permissions(ns, tsk, HIDEPID_INVISIBLE))
+ 		ret = -ENOENT;
+ 	else if (tsk)
+ 		task_dump_owner(tsk, inode->i_mode, &stat->uid, &stat->gid);
+ 	rcu_read_unlock();
+ 	return ret;
+ }
+ 
+ /* dentry stuff */
+ 
+ /*
+  * Set <pid>/... inode ownership (can change due to setuid(), etc.)
+  */
+ void pid_update_inode(struct task_struct *task, struct inode *inode)
+ {
+ 	task_dump_owner(task, inode->i_mode, &inode->i_uid, &inode->i_gid);
+ 	/* owner was computed from the unmodified i_mode; now drop setuid/setgid bits */
+ 	inode->i_mode &= ~(S_ISUID | S_ISGID);
+ 	security_task_to_inode(task, inode);
+ }
+ 
+ /*
+  * Rewrite the inode's ownerships here because the owning task may have
+  * performed a setuid(), etc.
+  *
+  */
+ static int pid_revalidate(struct dentry *dentry, unsigned int flags)
+ {
+ 	struct task_struct *tsk;
+ 	struct inode *inode;
+ 
+ 	/* Not doable under LOOKUP_RCU: bail out with -ECHILD. */
+ 	if (flags & LOOKUP_RCU)
+ 		return -ECHILD;
+ 
+ 	inode = d_inode(dentry);
+ 	tsk = get_proc_task(inode);
+ 	if (!tsk)
+ 		return 0;	/* no task any more: dentry is not valid */
+ 
+ 	pid_update_inode(tsk, inode);
+ 	put_task_struct(tsk);
+ 	return 1;
+ }
+ 
+ static inline bool proc_inode_is_dead(struct inode *inode)
+ {	/* dead == the inode's struct pid has no PIDTYPE_PID task attached */
+ 	return !proc_pid(inode)->tasks[PIDTYPE_PID].first;
+ }
+ 
+ int pid_delete_dentry(const struct dentry *dentry)
+ {
+ 	/* Is the task we represent dead?
+ 	 * If so, return non-zero so that the dentry is not put on the
+ 	 * lru list but killed immediately.
+ 	 */
+ 	return proc_inode_is_dead(d_inode(dentry));
+ }
+ 
+ const struct dentry_operations pid_dentry_operations =
+ {
+ 	.d_delete	= pid_delete_dentry,	/* drop dentries of dead tasks at once */
+ 	.d_revalidate	= pid_revalidate,	/* refreshes inode ownership */
+ };
+ 
+ /* Lookups */
+ 
+ /*
+  * Fill a directory entry.
+  *
+  * If possible create the dcache entry and derive our inode number and
+  * file type from dcache entry.
+  *
+  * Since all of the proc inode numbers are dynamically generated, the inode
+  * numbers do not exist until the inode is cached.  This means creating
+  * the dcache entry in readdir is necessary to keep the inode numbers
+  * reported by readdir in sync with the inode numbers reported
+  * by stat.
+  */
+ bool proc_fill_cache(struct file *file, struct dir_context *ctx,
+ 	const char *name, unsigned int len,
+ 	instantiate_t instantiate, struct task_struct *task, const void *ptr)
+ {
+ 	struct dentry *child, *dir = file->f_path.dentry;
+ 	struct qstr qname = QSTR_INIT(name, len);
+ 	struct inode *inode;
+ 	unsigned type = DT_UNKNOWN;	/* fallbacks emitted when no dentry is available */
+ 	ino_t ino = 1;
+ 
+ 	child = d_hash_and_lookup(dir, &qname);
+ 	if (!child) {	/* not in the dcache yet: allocate a dentry and instantiate it */
+ 		DECLARE_WAIT_QUEUE_HEAD_ONSTACK(wq);
+ 		child = d_alloc_parallel(dir, &qname, &wq);
+ 		if (IS_ERR(child))
+ 			goto end_instantiate;
+ 		if (d_in_lookup(child)) {
+ 			struct dentry *res;
+ 			res = instantiate(child, task, ptr);
+ 			d_lookup_done(child);
+ 			if (unlikely(res)) {	/* callback returned another dentry or an error */
+ 				dput(child);
+ 				child = res;
+ 				if (IS_ERR(child))
+ 					goto end_instantiate;
+ 			}
+ 		}
+ 	}
+ 	inode = d_inode(child);
+ 	ino = inode->i_ino;
+ 	type = inode->i_mode >> 12;	/* upper i_mode bits become the emitted type */
+ 	dput(child);
+ end_instantiate:
+ 	return dir_emit(ctx, name, len, ino, type);
+ }
+ 
+ /*
+  * dname_to_vma_addr - maps a dentry name into two unsigned longs
+  * which represent vma start and end addresses.
+  */
+ static int dname_to_vma_addr(struct dentry *dentry,
+ 			     unsigned long *start, unsigned long *end)
+ {
+ 	const char *str = dentry->d_name.name;
+ 	unsigned long long sval, eval;
+ 	unsigned int len;
+ 	/* start address, base 16; a leading '0' is only accepted as "0-..." */
+ 	if (str[0] == '0' && str[1] != '-')
+ 		return -EINVAL;
+ 	len = _parse_integer(str, 16, &sval);
+ 	if (len & KSTRTOX_OVERFLOW)
+ 		return -EINVAL;
+ 	if (sval != (unsigned long)sval)	/* must fit in unsigned long */
+ 		return -EINVAL;
+ 	str += len;
+ 	/* a single '-' separates the two addresses */
+ 	if (*str != '-')
+ 		return -EINVAL;
+ 	str++;
+ 	/* end address, base 16; a leading '0' is only accepted as a lone "0" */
+ 	if (str[0] == '0' && str[1])
+ 		return -EINVAL;
+ 	len = _parse_integer(str, 16, &eval);
+ 	if (len & KSTRTOX_OVERFLOW)
+ 		return -EINVAL;
+ 	if (eval != (unsigned long)eval)	/* must fit in unsigned long */
+ 		return -EINVAL;
+ 	str += len;
+ 	/* nothing may follow the end address */
+ 	if (*str != '\0')
+ 		return -EINVAL;
+ 
+ 	*start = sval;
+ 	*end = eval;
+ 
+ 	return 0;
+ }
+ 
+ static int map_files_d_revalidate(struct dentry *dentry, unsigned int flags)
+ {
+ 	unsigned long vm_start, vm_end;
+ 	bool exact_vma_exists = false;
+ 	struct mm_struct *mm = NULL;
+ 	struct task_struct *task;
+ 	struct inode *inode;
+ 	int status = 0;	/* 0 unless the vma is found or the killable wait fails */
+ 
+ 	if (flags & LOOKUP_RCU)
+ 		return -ECHILD;
+ 
+ 	inode = d_inode(dentry);
+ 	task = get_proc_task(inode);
+ 	if (!task)
+ 		goto out_notask;
+ 
+ 	mm = mm_access(task, PTRACE_MODE_READ_FSCREDS);
+ 	if (IS_ERR_OR_NULL(mm))
+ 		goto out;
+ 	/* the dentry name encodes the range; look for a vma matching it exactly */
+ 	if (!dname_to_vma_addr(dentry, &vm_start, &vm_end)) {
+ 		status = down_read_killable(&mm->mmap_sem);	/* failure is returned as-is */
+ 		if (!status) {
+ 			exact_vma_exists = !!find_exact_vma(mm, vm_start,
+ 							    vm_end);
+ 			up_read(&mm->mmap_sem);
+ 		}
+ 	}
+ 
+ 	mmput(mm);
+ 	/* still mapped: refresh owner, call the security hook, report valid (1) */
+ 	if (exact_vma_exists) {
+ 		task_dump_owner(task, 0, &inode->i_uid, &inode->i_gid);
+ 
+ 		security_task_to_inode(task, inode);
+ 		status = 1;
+ 	}
+ 
+ out:
+ 	put_task_struct(task);
+ 
+ out_notask:
+ 	return status;
+ }
+ 
+ static const struct dentry_operations tid_map_files_dentry_operations = {
+ 	.d_delete	= pid_delete_dentry,
+ 	.d_revalidate	= map_files_d_revalidate,	/* valid while the exact vma exists */
+ };
+ 
+ static int map_files_get_link(struct dentry *dentry, struct path *path)
+ {
+ 	unsigned long vm_start, vm_end;
+ 	struct vm_area_struct *vma;
+ 	struct task_struct *task;
+ 	struct mm_struct *mm;
+ 	int rc;
+ 
+ 	rc = -ENOENT;	/* returned if the task or its mm cannot be obtained */
+ 	task = get_proc_task(d_inode(dentry));
+ 	if (!task)
+ 		goto out;
+ 
+ 	mm = get_task_mm(task);
+ 	put_task_struct(task);
+ 	if (!mm)
+ 		goto out;
+ 
+ 	rc = dname_to_vma_addr(dentry, &vm_start, &vm_end);
+ 	if (rc)
+ 		goto out_mmput;
+ 
+ 	rc = down_read_killable(&mm->mmap_sem);
+ 	if (rc)
+ 		goto out_mmput;
+ 
+ 	rc = -ENOENT;	/* no exact vma, or a vma without a backing file */
+ 	vma = find_exact_vma(mm, vm_start, vm_end);
+ 	if (vma && vma->vm_file) {
+ 		*path = vma->vm_file->f_path;
+ 		path_get(path);	/* take a reference on the path being returned */
+ 		rc = 0;
+ 	}
+ 	up_read(&mm->mmap_sem);
+ 
+ out_mmput:
+ 	mmput(mm);
+ out:
+ 	return rc;
+ }
+ 
+ struct map_files_info {
+ 	unsigned long	start;	/* vma->vm_start of the mapping */
+ 	unsigned long	end;	/* vma->vm_end of the mapping */
+ 	fmode_t		mode;	/* f_mode of the file backing the vma */
+ };
+ 
+ /*
+  * Only allow CAP_SYS_ADMIN to follow the links, due to concerns about how the
+  * symlinks may be used to bypass permissions on ancestor directories in the
+  * path to the file in question.
+  */
+ static const char *
+ proc_map_files_get_link(struct dentry *dentry,
+ 			struct inode *inode,
+ 		        struct delayed_call *done)
+ {
+ 	if (!capable(CAP_SYS_ADMIN))
+ 		return ERR_PTR(-EPERM);
+ 	/* capability check passed: defer to the regular pid link resolution */
+ 	return proc_pid_get_link(dentry, inode, done);
+ }
+ 
+ /*
+  * Identical to proc_pid_link_inode_operations except for get_link()
+  */
+ static const struct inode_operations proc_map_files_link_inode_operations = {
+ 	.get_link	= proc_map_files_get_link,	/* CAP_SYS_ADMIN gated */
+ 	.readlink	= proc_pid_readlink,
+ 	.setattr	= proc_setattr,
+ };
+ 
+ static struct dentry *
+ proc_map_files_instantiate(struct dentry *dentry,
+ 			   struct task_struct *task, const void *ptr)
+ {
+ 	fmode_t fmode = (fmode_t)(unsigned long)ptr;	/* f_mode smuggled via @ptr */
+ 	umode_t perm = S_IFLNK;
+ 	struct inode *inode;
+ 
+ 	/* Owner permission bits of the symlink mirror the mapped file's f_mode. */
+ 	if (fmode & FMODE_READ)
+ 		perm |= S_IRUSR;
+ 	if (fmode & FMODE_WRITE)
+ 		perm |= S_IWUSR;
+ 	inode = proc_pid_make_inode(dentry->d_sb, task, perm);
+ 	if (!inode)
+ 		return ERR_PTR(-ENOENT);
+ 
+ 	PROC_I(inode)->op.proc_get_link = map_files_get_link;
+ 	inode->i_op = &proc_map_files_link_inode_operations;
+ 	inode->i_size = 64;
+ 	d_set_d_op(dentry, &tid_map_files_dentry_operations);
+ 	return d_splice_alias(inode, dentry);
+ }
+ 
+ static struct dentry *proc_map_files_lookup(struct inode *dir,
+ 		struct dentry *dentry, unsigned int flags)
+ {
+ 	unsigned long vm_start, vm_end;
+ 	struct vm_area_struct *vma;
+ 	struct task_struct *task;
+ 	struct dentry *result;
+ 	struct mm_struct *mm;
+ 	/* before each step, @result is preset to the error that step reports */
+ 	result = ERR_PTR(-ENOENT);
+ 	task = get_proc_task(dir);
+ 	if (!task)
+ 		goto out;
+ 
+ 	result = ERR_PTR(-EACCES);
+ 	if (!ptrace_may_access(task, PTRACE_MODE_READ_FSCREDS))
+ 		goto out_put_task;
+ 
+ 	result = ERR_PTR(-ENOENT);
+ 	if (dname_to_vma_addr(dentry, &vm_start, &vm_end))	/* name is not a valid range */
+ 		goto out_put_task;
+ 
+ 	mm = get_task_mm(task);
+ 	if (!mm)
+ 		goto out_put_task;
+ 
+ 	result = ERR_PTR(-EINTR);
+ 	if (down_read_killable(&mm->mmap_sem))
+ 		goto out_put_mm;
+ 
+ 	result = ERR_PTR(-ENOENT);
+ 	vma = find_exact_vma(mm, vm_start, vm_end);
+ 	if (!vma)
+ 		goto out_no_vma;
+ 	/* a vma without a backing file keeps the -ENOENT preset above */
+ 	if (vma->vm_file)
+ 		result = proc_map_files_instantiate(dentry, task,
+ 				(void *)(unsigned long)vma->vm_file->f_mode);
+ 
+ out_no_vma:
+ 	up_read(&mm->mmap_sem);
+ out_put_mm:
+ 	mmput(mm);
+ out_put_task:
+ 	put_task_struct(task);
+ out:
+ 	return result;
+ }
+ 
+ static const struct inode_operations proc_map_files_inode_operations = {	/* for the map_files directory itself */
+ 	.lookup		= proc_map_files_lookup,
+ 	.permission	= proc_fd_permission,
+ 	.setattr	= proc_setattr,
+ };
+ 
+ static int
+ proc_map_files_readdir(struct file *file, struct dir_context *ctx)
+ {	/* Emit one "<start>-<end>" (hex) directory entry per file-backed vma of the task. */
+ 	struct vm_area_struct *vma;
+ 	struct task_struct *task;
+ 	struct mm_struct *mm;
+ 	unsigned long nr_files, pos, i;
+ 	GENRADIX(struct map_files_info) fa;	/* pass-1 snapshot consumed by pass 2 */
+ 	struct map_files_info *p;
+ 	int ret;
+ 
+ 	genradix_init(&fa);
+ 
+ 	ret = -ENOENT;
+ 	task = get_proc_task(file_inode(file));
+ 	if (!task)
+ 		goto out;
+ 
+ 	ret = -EACCES;
+ 	if (!ptrace_may_access(task, PTRACE_MODE_READ_FSCREDS))
+ 		goto out_put_task;
+ 
+ 	ret = 0;	/* the early exits just below report success with nothing emitted */
+ 	if (!dir_emit_dots(file, ctx))
+ 		goto out_put_task;
+ 
+ 	mm = get_task_mm(task);
+ 	if (!mm)
+ 		goto out_put_task;
+ 
+ 	ret = down_read_killable(&mm->mmap_sem);
+ 	if (ret) {
+ 		mmput(mm);
+ 		goto out_put_task;
+ 	}
+ 
+ 	nr_files = 0;
+ 
+ 	/*
+ 	 * We need two passes here:
+ 	 *
+ 	 *  1) Collect vmas of mapped files with mmap_sem taken
+ 	 *  2) Release mmap_sem and instantiate entries
+ 	 *
+ 	 * otherwise lockdep complains, since the filldir()
+ 	 * routine might require mmap_sem taken in might_fault().
+ 	 */
+ 
+ 	for (vma = mm->mmap, pos = 2; vma; vma = vma->vm_next) {	/* pos starts at 2, after what dir_emit_dots() handled */
+ 		if (!vma->vm_file)
+ 			continue;
+ 		if (++pos <= ctx->pos)	/* skip entries positioned before ctx->pos */
+ 			continue;
+ 
+ 		p = genradix_ptr_alloc(&fa, nr_files++, GFP_KERNEL);
+ 		if (!p) {
+ 			ret = -ENOMEM;
+ 			up_read(&mm->mmap_sem);
+ 			mmput(mm);
+ 			goto out_put_task;
+ 		}
+ 
+ 		p->start = vma->vm_start;
+ 		p->end = vma->vm_end;
+ 		p->mode = vma->vm_file->f_mode;
+ 	}
+ 	up_read(&mm->mmap_sem);
+ 	mmput(mm);
+ 
+ 	for (i = 0; i < nr_files; i++) {	/* pass 2: runs without mmap_sem or the mm reference */
+ 		char buf[4 * sizeof(long) + 2];	/* max: %lx-%lx\0 */
+ 		unsigned int len;
+ 
+ 		p = genradix_ptr(&fa, i);
+ 		len = snprintf(buf, sizeof(buf), "%lx-%lx", p->start, p->end);
+ 		if (!proc_fill_cache(file, ctx,
+ 				      buf, len,
+ 				      proc_map_files_instantiate,
+ 				      task,
+ 				      (void *)(unsigned long)p->mode))
+ 			break;
+ 		ctx->pos++;
+ 	}
+ 
+ out_put_task:
+ 	put_task_struct(task);
+ out:
+ 	genradix_free(&fa);	/* reached on every path, including the early failures */
+ 	return ret;
+ }
+ 
+ static const struct file_operations proc_map_files_operations = {	/* directory file ops for map_files */
+ 	.read		= generic_read_dir,
+ 	.iterate_shared	= proc_map_files_readdir,
+ 	.llseek		= generic_file_llseek,
+ };
+ 
+ #if defined(CONFIG_CHECKPOINT_RESTORE) && defined(CONFIG_POSIX_TIMERS)
+ struct timers_private {	/* per-open iterator state for the "timers" seq_file */
+ 	struct pid *pid;		/* target, recorded by proc_timers_open() */
+ 	struct task_struct *task;	/* pinned in timers_start(), dropped in timers_stop() */
+ 	struct sighand_struct *sighand;	/* non-NULL while timers_start() holds the sighand lock */
+ 	struct pid_namespace *ns;	/* namespace used to translate it_pid in show_timer() */
+ 	unsigned long flags;		/* passed to lock_task_sighand()/unlock_task_sighand() */
+ };
+ 
+ static void *timers_start(struct seq_file *m, loff_t *pos)
+ {	/* seq_file ->start: pin the task, take its sighand lock, position at entry *pos. */
+ 	struct timers_private *tp = m->private;
+ 
+ 	tp->task = get_pid_task(tp->pid, PIDTYPE_PID);
+ 	if (!tp->task)
+ 		return ERR_PTR(-ESRCH);
+ 
+ 	tp->sighand = lock_task_sighand(tp->task, &tp->flags);
+ 	if (!tp->sighand)
+ 		return ERR_PTR(-ESRCH);	/* tp->task stays set here; timers_stop() is what drops it */
+ 
+ 	return seq_list_start(&tp->task->signal->posix_timers, *pos);
+ }
+ 
+ static void *timers_next(struct seq_file *m, void *v, loff_t *pos)
+ {
+ 	struct task_struct *owner = ((struct timers_private *)m->private)->task;	/* set by timers_start() */
+ 	return seq_list_next(v, &owner->signal->posix_timers, pos);
+ }
+ 
+ static void timers_stop(struct seq_file *m, void *v)
+ {
+ 	struct timers_private *priv = m->private;
+ 	struct task_struct *held = priv->task;
+ 
+ 	/* Undo timers_start() in reverse order: sighand lock first, then the task ref. */
+ 	if (priv->sighand)
+ 		unlock_task_sighand(held, &priv->flags);
+ 	priv->sighand = NULL;
+ 
+ 	if (held)
+ 		put_task_struct(held);
+ 	priv->task = NULL;
+ }
+ 
+ static int show_timer(struct seq_file *m, void *v)
+ {	/* seq_file ->show: print ID, signal, notify mode and clock of one posix timer. */
+ 	struct k_itimer *timer;
+ 	struct timers_private *tp = m->private;
+ 	int notify;
+ 	static const char * const nstr[] = {	/* indexed by the SIGEV_* notify type */
+ 		[SIGEV_SIGNAL] = "signal",
+ 		[SIGEV_NONE] = "none",
+ 		[SIGEV_THREAD] = "thread",
+ 	};
+ 
+ 	timer = list_entry((struct list_head *)v, struct k_itimer, list);	/* v is the list node handed out by start/next */
+ 	notify = timer->it_sigev_notify;
+ 
+ 	seq_printf(m, "ID: %d\n", timer->it_id);
+ 	seq_printf(m, "signal: %d/%px\n",
+ 		   timer->sigq->info.si_signo,
+ 		   timer->sigq->info.si_value.sival_ptr);
+ 	seq_printf(m, "notify: %s/%s.%d\n",
+ 		   nstr[notify & ~SIGEV_THREAD_ID],	/* SIGEV_THREAD_ID bit is masked off before indexing */
+ 		   (notify & SIGEV_THREAD_ID) ? "tid" : "pid",
+ 		   pid_nr_ns(timer->it_pid, tp->ns));
+ 	seq_printf(m, "ClockID: %d\n", timer->it_clock);
+ 
+ 	return 0;
+ }
+ 
+ static const struct seq_operations proc_timers_seq_ops = {	/* iterator over a task's posix_timers list */
+ 	.start	= timers_start,
+ 	.next	= timers_next,
+ 	.stop	= timers_stop,
+ 	.show	= show_timer,
+ };
+ 
+ static int proc_timers_open(struct inode *inode, struct file *file)
+ {
+ 	struct timers_private *priv = __seq_open_private(file, &proc_timers_seq_ops,
+ 							 sizeof(*priv));
+ 
+ 	if (priv == NULL)
+ 		return -ENOMEM;
+ 
+ 	/* Remember whose timers to walk and which pid namespace to report ids in. */
+ 	priv->ns = proc_pid_ns(inode);
+ 	priv->pid = proc_pid(inode);
+ 	return 0;
+ }
+ 
+ static const struct file_operations proc_timers_operations = {	/* file ops for the "timers" entry */
+ 	.open		= proc_timers_open,
+ 	.read		= seq_read,
+ 	.llseek		= seq_lseek,
+ 	.release	= seq_release_private,	/* presumably the counterpart of __seq_open_private() in open - confirm */
+ };
+ #endif
+ 
+ static ssize_t timerslack_ns_write(struct file *file, const char __user *buf,
+ 					size_t count, loff_t *offset)
+ {
+ 	struct task_struct *target;
+ 	ssize_t ret = count;
+ 	u64 requested;
+ 	bool allowed;
+ 	int rc;
+ 
+ 	rc = kstrtoull_from_user(buf, count, 10, &requested);
+ 	if (rc < 0)
+ 		return rc;
+ 
+ 	target = get_proc_task(file_inode(file));
+ 	if (!target)
+ 		return -ESRCH;
+ 
+ 	/* Touching another task needs CAP_SYS_NICE in its user namespace plus LSM consent. */
+ 	if (target != current) {
+ 		rcu_read_lock();
+ 		allowed = ns_capable(__task_cred(target)->user_ns, CAP_SYS_NICE);
+ 		rcu_read_unlock();
+ 		if (!allowed) {
+ 			ret = -EPERM;
+ 			goto put;
+ 		}
+ 
+ 		rc = security_task_setscheduler(target);
+ 		if (rc) {
+ 			ret = rc;
+ 			goto put;
+ 		}
+ 	}
+ 
+ 	/* Writing 0 restores the task's default slack. */
+ 	task_lock(target);
+ 	target->timer_slack_ns = requested ? requested : target->default_timer_slack_ns;
+ 	task_unlock(target);
+ 
+ put:
+ 	put_task_struct(target);
+ 
+ 	return ret;
+ }
+ 
+ static int timerslack_ns_show(struct seq_file *m, void *v)
+ {
+ 	struct task_struct *target = get_proc_task(m->private);
+ 	bool allowed;
+ 	int rc = 0;
+ 
+ 	if (!target)
+ 		return -ESRCH;
+ 
+ 	/* Reading another task's slack needs CAP_SYS_NICE in its user namespace plus LSM consent. */
+ 	if (target != current) {
+ 		rcu_read_lock();
+ 		allowed = ns_capable(__task_cred(target)->user_ns, CAP_SYS_NICE);
+ 		rcu_read_unlock();
+ 		if (!allowed) {
+ 			rc = -EPERM;
+ 			goto put;
+ 		}
+ 
+ 		rc = security_task_getscheduler(target);
+ 		if (rc)
+ 			goto put;
+ 	}
+ 
+ 	task_lock(target);
+ 	seq_printf(m, "%llu\n", target->timer_slack_ns);
+ 	task_unlock(target);
+ 
+ put:
+ 	put_task_struct(target);
+ 
+ 	return rc;
+ }
+ 
+ static int timerslack_ns_open(struct inode *inode, struct file *filp)
+ {
+ 	return single_open(filp, timerslack_ns_show, inode);	/* the show routine reads this inode back from m->private */
+ }
+ 
+ static const struct file_operations proc_pid_set_timerslack_ns_operations = {	/* file ops for "timerslack_ns" */
+ 	.open		= timerslack_ns_open,
+ 	.read		= seq_read,
+ 	.write		= timerslack_ns_write,
+ 	.llseek		= seq_lseek,
+ 	.release	= single_release,
+ };
+ 
+ static struct dentry *proc_pident_instantiate(struct dentry *dentry,
+ 	struct task_struct *task, const void *ptr)
+ {
+ 	const struct pid_entry *entry = ptr;
+ 	struct inode *node = proc_pid_make_inode(dentry->d_sb, task, entry->mode);
+ 
+ 	if (!node)
+ 		return ERR_PTR(-ENOENT);
+ 
+ 	if (S_ISDIR(node->i_mode))
+ 		set_nlink(node, 2);	/* Use getattr to fix if necessary */
+ 	/* Only override the operations the table entry actually supplies. */
+ 	if (entry->iop)
+ 		node->i_op = entry->iop;
+ 	if (entry->fop)
+ 		node->i_fop = entry->fop;
+ 	PROC_I(node)->op = entry->op;
+ 
+ 	pid_update_inode(task, node);
+ 	d_set_d_op(dentry, &pid_dentry_operations);
+ 
+ 	return d_splice_alias(node, dentry);
+ }
+ 
+ static struct dentry *proc_pident_lookup(struct inode *dir,
+ 					 struct dentry *dentry,
+ 					 const struct pid_entry *p,
+ 					 const struct pid_entry *end)
+ {	/* Find @dentry's name in the table [p, end) and instantiate it; ERR_PTR(-ENOENT) if nothing matches. */
+ 	struct task_struct *task = get_proc_task(dir);
+ 	struct dentry *res = ERR_PTR(-ENOENT);
+ 
+ 	if (!task)
+ 		return res;
+ 
+ 	/* A refused lookup must still drop the reference get_proc_task() took. */
+ 	if (gr_pid_is_chrooted(task))
+ 		goto out_put_task;
+ 
+ 	/*
+ 	 * Yes, it does not scale. And it should not. Don't add
+ 	 * new entries into /proc/<tgid>/ without very good reasons.
+ 	 */
+ 	for (; p < end; p++) {
+ 		if (p->len != dentry->d_name.len)	/* cheap length test before comparing bytes */
+ 			continue;
+ 		if (!memcmp(dentry->d_name.name, p->name, p->len)) {
+ 			res = proc_pident_instantiate(dentry, task, p);
+ 			break;
+ 		}
+ 	}
+ out_put_task:
+ 	put_task_struct(task);
+ 	return res;
+ }
+ 
+ static int proc_pident_readdir(struct file *file, struct dir_context *ctx,
+ 		const struct pid_entry *ents, unsigned int nents)
+ {
+ 	struct task_struct *task = get_proc_task(file_inode(file));
+ 	const struct pid_entry *const last = ents + nents;
+ 	const struct pid_entry *cur;
+ 
+ 	if (!task)
+ 		return -ENOENT;
+ 
+ 	/* Emit nothing, yet still return 0, for a chrooted task or when the dots cannot be emitted. */
+ 	if (gr_pid_is_chrooted(task) || !dir_emit_dots(file, ctx))
+ 		goto out;
+ 	/* Table entry k lives at directory position k + 2. */
+ 	if (ctx->pos >= nents + 2)
+ 		goto out;
+ 	cur = ents + (ctx->pos - 2);
+ 	while (cur < last &&
+ 	       proc_fill_cache(file, ctx, cur->name, cur->len,
+ 			       proc_pident_instantiate, task, cur)) {
+ 		ctx->pos++;
+ 		cur++;
+ 	}
+ out:
+ 	put_task_struct(task);
+ 	return 0;
+ }
+ 
+ #ifdef CONFIG_SECURITY
+ static ssize_t proc_pid_attr_read(struct file * file, char __user * buf,
+ 				  size_t count, loff_t *ppos)
+ {
+ 	struct inode *inode = file_inode(file);
+ 	struct task_struct *task = get_proc_task(inode);
+ 	char *value = NULL;	/* filled in by security_getprocattr(), freed below */
+ 	ssize_t len;
+ 
+ 	if (!task)
+ 		return -ESRCH;
+ 
+ 	len = security_getprocattr(task, PROC_I(inode)->op.lsm,
+ 				   (char*)file->f_path.dentry->d_name.name,
+ 				   &value);
+ 	put_task_struct(task);
+ 	if (len > 0)	/* errors and empty values are returned as they are */
+ 		len = simple_read_from_buffer(buf, count, ppos, value, len);
+ 	kfree(value);
+ 	return len;
+ }
+ 
+ static ssize_t proc_pid_attr_write(struct file * file, const char __user * buf,
+ 				   size_t count, loff_t *ppos)
+ {	/* Set one LSM attribute of the calling task from at most one page of user data. */
+ 	struct inode * inode = file_inode(file);
+ 	struct task_struct *task;
+ 	void *page;
+ 	int rv;
+ 
+ 	rcu_read_lock();
+ 	task = pid_task(proc_pid(inode), PIDTYPE_PID);	/* no reference taken; used only under rcu_read_lock() */
+ 	if (!task) {
+ 		rcu_read_unlock();
+ 		return -ESRCH;
+ 	}
+ 	/* A task may only write its own attributes. */
+ 	if (current != task) {
+ 		rcu_read_unlock();
+ 		return -EACCES;
+ 	}
+ 	/* Prevent changes to overridden credentials. */
+ 	if (current_cred() != current_real_cred()) {
+ 		rcu_read_unlock();
+ 		return -EBUSY;
+ 	}
+ 	rcu_read_unlock();
+ 
+ 	if (count > PAGE_SIZE)	/* longer writes are silently truncated to one page */
+ 		count = PAGE_SIZE;
+ 
+ 	/* No partial writes. */
+ 	if (*ppos != 0)
+ 		return -EINVAL;
+ 
+ 	page = memdup_user(buf, count);
+ 	if (IS_ERR(page)) {
+ 		rv = PTR_ERR(page);
+ 		goto out;
+ 	}
+ 
+ 	/* Guard against adverse ptrace interaction */
+ 	rv = mutex_lock_interruptible(&current->signal->cred_guard_mutex);
+ 	if (rv < 0)
+ 		goto out_free;
+ 
+ 	rv = security_setprocattr(PROC_I(inode)->op.lsm,
+ 				  file->f_path.dentry->d_name.name, page,
+ 				  count);	/* rv from here is returned to the caller unchanged */
+ 	mutex_unlock(&current->signal->cred_guard_mutex);
+ out_free:
+ 	kfree(page);
+ out:
+ 	return rv;
+ }
+ 
+ static const struct file_operations proc_pid_attr_operations = {	/* read/write of one LSM attribute file */
+ 	.read		= proc_pid_attr_read,
+ 	.write		= proc_pid_attr_write,
+ 	.llseek		= generic_file_llseek,
+ };
+ 
+ #define LSM_DIR_OPS(LSM) /* emits readdir + lookup helpers and both ops tables for LSM##_attr_dir_stuff */ \
+ static int proc_##LSM##_attr_dir_iterate(struct file *filp, \
+ 			     struct dir_context *ctx) \
+ { \
+ 	return proc_pident_readdir(filp, ctx, \
+ 				   LSM##_attr_dir_stuff, \
+ 				   ARRAY_SIZE(LSM##_attr_dir_stuff)); \
+ } \
+ \
+ static const struct file_operations proc_##LSM##_attr_dir_ops = { \
+ 	.read		= generic_read_dir, \
+ 	.iterate	= proc_##LSM##_attr_dir_iterate, /* NOTE(review): other tables in this file use .iterate_shared */ \
+ 	.llseek		= default_llseek, \
+ }; \
+ \
+ static struct dentry *proc_##LSM##_attr_dir_lookup(struct inode *dir, \
+ 				struct dentry *dentry, unsigned int flags) \
+ { \
+ 	return proc_pident_lookup(dir, dentry, \
+ 				  LSM##_attr_dir_stuff, \
+ 				  LSM##_attr_dir_stuff + ARRAY_SIZE(LSM##_attr_dir_stuff)); \
+ } \
+ \
+ static const struct inode_operations proc_##LSM##_attr_dir_inode_ops = { \
+ 	.lookup		= proc_##LSM##_attr_dir_lookup, \
+ 	.getattr	= pid_getattr, \
+ 	.setattr	= proc_setattr, \
+ }
+ 
+ #ifdef CONFIG_SECURITY_SMACK
+ static const struct pid_entry smack_attr_dir_stuff[] = {	/* entries of the "smack" directory listed in attr_dir_stuff */
+ 	ATTR("smack", "current",	0666),
+ };
+ LSM_DIR_OPS(smack);	/* provides proc_smack_attr_dir_ops and proc_smack_attr_dir_inode_ops */
+ #endif
+ 
+ static const struct pid_entry attr_dir_stuff[] = {	/* table behind proc_attr_dir_readdir()/proc_attr_dir_lookup() */
+ 	ATTR(NULL, "current",		0666),
+ 	ATTR(NULL, "prev",		0444),
+ 	ATTR(NULL, "exec",		0666),
+ 	ATTR(NULL, "fscreate",		0666),
+ 	ATTR(NULL, "keycreate",		0666),
+ 	ATTR(NULL, "sockcreate",	0666),
+ #ifdef CONFIG_SECURITY_SMACK
+ 	DIR("smack",			0555,
+ 	    proc_smack_attr_dir_inode_ops, proc_smack_attr_dir_ops),
+ #endif
+ };
+ 
+ static int proc_attr_dir_readdir(struct file *file, struct dir_context *ctx)
+ {
+ 	const unsigned int nents = ARRAY_SIZE(attr_dir_stuff);	/* whole table, every time */
+ 	return proc_pident_readdir(file, ctx, attr_dir_stuff, nents);
+ }
+ 
+ static const struct file_operations proc_attr_dir_operations = {	/* directory file ops backed by attr_dir_stuff */
+ 	.read		= generic_read_dir,
+ 	.iterate_shared	= proc_attr_dir_readdir,
+ 	.llseek		= generic_file_llseek,
+ };
+ 
+ static struct dentry *proc_attr_dir_lookup(struct inode *dir,
+ 				struct dentry *dentry, unsigned int flags)
+ {
+ 	const struct pid_entry *const first = attr_dir_stuff;
+ 	/* @flags is not used; the table is handed over as a [first, end) range. */
+ 	return proc_pident_lookup(dir, dentry, first, first + ARRAY_SIZE(attr_dir_stuff));
+ }
+ 
+ static const struct inode_operations proc_attr_dir_inode_operations = {	/* inode ops backed by attr_dir_stuff */
+ 	.lookup		= proc_attr_dir_lookup,
+ 	.getattr	= pid_getattr,
+ 	.setattr	= proc_setattr,
+ };
+ 
+ #endif
+ 
+ #ifdef CONFIG_ELF_CORE
+ static ssize_t proc_coredump_filter_read(struct file *file, char __user *buf,
+ 					 size_t count, loff_t *ppos)
+ {
+ 	struct task_struct *task = get_proc_task(file_inode(file));
+ 	char text[PROC_NUMBUF];
+ 	struct mm_struct *mm;
+ 	unsigned long filter;
+ 	size_t len;
+ 	int ret = 0;
+ 
+ 	if (!task)
+ 		return -ESRCH;
+ 
+ 	/* A task without an mm reads back as an empty file (ret stays 0). */
+ 	mm = get_task_mm(task);
+ 	if (!mm)
+ 		goto out_put_task;
+ 
+ 	filter = (mm->flags & MMF_DUMP_FILTER_MASK) >> MMF_DUMP_FILTER_SHIFT;
+ 	mmput(mm);
+ 	len = snprintf(text, sizeof(text), "%08lx\n", filter);
+ 	ret = simple_read_from_buffer(buf, count, ppos, text, len);
+ out_put_task:
+ 	put_task_struct(task);
+ 	return ret;
+ }
+ 
+ static ssize_t proc_coredump_filter_write(struct file *file,
+ 					  const char __user *buf,
+ 					  size_t count,
+ 					  loff_t *ppos)
+ {
+ 	struct task_struct *task;
+ 	struct mm_struct *mm;
+ 	unsigned int val;
+ 	int bit;
+ 	int ret;
+ 
+ 	ret = kstrtouint_from_user(buf, count, 0, &val);
+ 	if (ret < 0)
+ 		return ret;
+ 
+ 	task = get_proc_task(file_inode(file));
+ 	if (!task)
+ 		return -ESRCH;
+ 
+ 	mm = get_task_mm(task);
+ 	if (!mm) {
+ 		put_task_struct(task);
+ 		return -ESRCH;
+ 	}
+ 
+ 	/*
+ 	 * Mirror each of the low MMF_DUMP_FILTER_BITS bits of @val into the
+ 	 * dump-filter field of mm->flags, one bit operation at a time.
+ 	 */
+ 	for (bit = 0; bit < MMF_DUMP_FILTER_BITS; bit++) {
+ 		if (val & (1UL << bit))
+ 			set_bit(bit + MMF_DUMP_FILTER_SHIFT, &mm->flags);
+ 		else
+ 			clear_bit(bit + MMF_DUMP_FILTER_SHIFT, &mm->flags);
+ 	}
+ 
+ 	mmput(mm);
+ 	put_task_struct(task);
+ 	/* On success the whole write is reported as consumed. */
+ 	return count;
+ }
+ 
+ static const struct file_operations proc_coredump_filter_operations = {	/* file ops for "coredump_filter" */
+ 	.read		= proc_coredump_filter_read,
+ 	.write		= proc_coredump_filter_write,
+ 	.llseek		= generic_file_llseek,
+ };
+ #endif
+ 
+ #ifdef CONFIG_TASK_IO_ACCOUNTING
+ static int do_io_accounting(struct task_struct *task, struct seq_file *m, int whole)
+ {	/* Print I/O counters for @task alone, or summed over its thread group when @whole is non-zero. */
+ 	struct task_io_accounting acct = task->ioac;	/* copied before any lock is taken */
+ 	unsigned long flags;
+ 	int result;
+ 
+ 	result = down_read_killable(&task->signal->exec_update_lock);
+ 	if (result)
+ 		return result;
+ 
+ 	if (!ptrace_may_access(task, PTRACE_MODE_READ_FSCREDS)) {	/* checked with exec_update_lock held */
+ 		result = -EACCES;
+ 		goto out_unlock;
+ 	}
+ 
+ 	if (whole && lock_task_sighand(task, &flags)) {	/* if the lock is not obtained, only @task's own counters are printed */
+ 		struct task_struct *t = task;
+ 
+ 		task_io_accounting_add(&acct, &task->signal->ioac);
+ 		while_each_thread(task, t)
+ 			task_io_accounting_add(&acct, &t->ioac);
+ 
+ 		unlock_task_sighand(task, &flags);
+ 	}
+ 	seq_printf(m,
+ 		   "rchar: %llu\n"
+ 		   "wchar: %llu\n"
+ 		   "syscr: %llu\n"
+ 		   "syscw: %llu\n"
+ 		   "read_bytes: %llu\n"
+ 		   "write_bytes: %llu\n"
+ 		   "cancelled_write_bytes: %llu\n",
+ 		   (unsigned long long)acct.rchar,
+ 		   (unsigned long long)acct.wchar,
+ 		   (unsigned long long)acct.syscr,
+ 		   (unsigned long long)acct.syscw,
+ 		   (unsigned long long)acct.read_bytes,
+ 		   (unsigned long long)acct.write_bytes,
+ 		   (unsigned long long)acct.cancelled_write_bytes);
+ 	result = 0;
+ 
+ out_unlock:
+ 	up_read(&task->signal->exec_update_lock);
+ 	return result;
+ }
+ 
+ static int proc_tid_io_accounting(struct seq_file *m, struct pid_namespace *ns,
+ 				  struct pid *pid, struct task_struct *task)
+ {
+ 	return do_io_accounting(task, m, 0);	/* whole == 0: this task's counters only */
+ }
+ 
+ static int proc_tgid_io_accounting(struct seq_file *m, struct pid_namespace *ns,
+ 				   struct pid *pid, struct task_struct *task)
+ {
+ 	return do_io_accounting(task, m, 1);	/* whole == 1: add signal-wide and per-thread counters */
+ }
+ #endif /* CONFIG_TASK_IO_ACCOUNTING */
+ 
+ #ifdef CONFIG_USER_NS
+ static int proc_id_map_open(struct inode *inode, struct file *file,
+ 	const struct seq_operations *seq_ops)
+ {	/* Shared open for the *_map files: seq_open() with the task's user namespace as private data. */
+ 	struct user_namespace *ns = NULL;
+ 	struct task_struct *task;
+ 	struct seq_file *seq;
+ 	int ret = -EINVAL;	/* returned when no task or no namespace is found */
+ 
+ 	task = get_proc_task(inode);
+ 	if (task) {
+ 		rcu_read_lock();
+ 		ns = get_user_ns(task_cred_xxx(task, user_ns));	/* reference kept until proc_id_map_release() */
+ 		rcu_read_unlock();
+ 		put_task_struct(task);
+ 	}
+ 	if (!ns)
+ 		goto err;
+ 
+ 	ret = seq_open(file, seq_ops);
+ 	if (ret)
+ 		goto err_put_ns;
+ 
+ 	seq = file->private_data;
+ 	seq->private = ns;
+ 
+ 	return 0;
+ err_put_ns:
+ 	put_user_ns(ns);
+ err:
+ 	return ret;
+ }
+ 
+ static int proc_id_map_release(struct inode *inode, struct file *file)
+ {
+ 	struct user_namespace *ns = ((struct seq_file *)file->private_data)->private;
+ 	/* Drop the namespace reference stored by proc_id_map_open(). */
+ 	put_user_ns(ns);
+ 	return seq_release(inode, file);
+ }
+ 
+ static int proc_uid_map_open(struct inode *inode, struct file *file)
+ {
+ 	return proc_id_map_open(inode, file, &proc_uid_seq_operations);	/* shared open path with the uid iterator */
+ }
+ 
+ static int proc_gid_map_open(struct inode *inode, struct file *file)
+ {
+ 	return proc_id_map_open(inode, file, &proc_gid_seq_operations);	/* shared open path with the gid iterator */
+ }
+ 
+ static int proc_projid_map_open(struct inode *inode, struct file *file)
+ {
+ 	return proc_id_map_open(inode, file, &proc_projid_seq_operations);	/* shared open path with the projid iterator */
+ }
+ 
+ static const struct file_operations proc_uid_map_operations = {	/* file ops for "uid_map" */
+ 	.open		= proc_uid_map_open,
+ 	.write		= proc_uid_map_write,
+ 	.read		= seq_read,
+ 	.llseek		= seq_lseek,
+ 	.release	= proc_id_map_release,	/* drops the namespace reference taken at open */
+ };
+ 
+ static const struct file_operations proc_gid_map_operations = {	/* file ops for "gid_map" */
+ 	.open		= proc_gid_map_open,
+ 	.write		= proc_gid_map_write,
+ 	.read		= seq_read,
+ 	.llseek		= seq_lseek,
+ 	.release	= proc_id_map_release,	/* drops the namespace reference taken at open */
+ };
+ 
+ static const struct file_operations proc_projid_map_operations = {	/* file ops for "projid_map" */
+ 	.open		= proc_projid_map_open,
+ 	.write		= proc_projid_map_write,
+ 	.read		= seq_read,
+ 	.llseek		= seq_lseek,
+ 	.release	= proc_id_map_release,	/* drops the namespace reference taken at open */
+ };
+ 
+ static int proc_setgroups_open(struct inode *inode, struct file *file)
+ {	/* Open "setgroups": single_open() with the task's user namespace as private data. */
+ 	struct user_namespace *ns = NULL;
+ 	struct task_struct *task;
+ 	int ret;
+ 
+ 	ret = -ESRCH;
+ 	task = get_proc_task(inode);
+ 	if (task) {
+ 		rcu_read_lock();
+ 		ns = get_user_ns(task_cred_xxx(task, user_ns));	/* reference kept until proc_setgroups_release() */
+ 		rcu_read_unlock();
+ 		put_task_struct(task);
+ 	}
+ 	if (!ns)
+ 		goto err;
+ 
+ 	if (file->f_mode & FMODE_WRITE) {	/* opening for write needs CAP_SYS_ADMIN in that namespace */
+ 		ret = -EACCES;
+ 		if (!ns_capable(ns, CAP_SYS_ADMIN))
+ 			goto err_put_ns;
+ 	}
+ 
+ 	ret = single_open(file, &proc_setgroups_show, ns);
+ 	if (ret)
+ 		goto err_put_ns;
+ 
+ 	return 0;
+ err_put_ns:
+ 	put_user_ns(ns);
+ err:
+ 	return ret;
+ }
+ 
+ static int proc_setgroups_release(struct inode *inode, struct file *file)
+ {
+ 	struct user_namespace *ns = ((struct seq_file *)file->private_data)->private;
+ 	const int rc = single_release(inode, file);	/* ns was read out before this call */
+ 
+ 	put_user_ns(ns);
+ 	return rc;
+ }
+ 
+ static const struct file_operations proc_setgroups_operations = {	/* file ops for "setgroups" */
+ 	.open		= proc_setgroups_open,
+ 	.write		= proc_setgroups_write,
+ 	.read		= seq_read,
+ 	.llseek		= seq_lseek,
+ 	.release	= proc_setgroups_release,
+ };
+ #endif /* CONFIG_USER_NS */
+ 
+ static int proc_pid_personality(struct seq_file *m, struct pid_namespace *ns,
+ 				struct pid *pid, struct task_struct *task)
+ {
+ 	const int err = lock_trace(task);	/* nothing is printed unless this succeeds */
+ 	if (err)
+ 		return err;
+ 	seq_printf(m, "%08x\n", task->personality);
+ 	unlock_trace(task);
+ 	return 0;
+ }
+ 
+ #ifdef CONFIG_LIVEPATCH
+ static int proc_pid_patch_state(struct seq_file *m, struct pid_namespace *ns,
+ 				struct pid *pid, struct task_struct *task)
+ {
+ 	seq_printf(m, "%d\n", task->patch_state);	/* raw field value; no locking or access check in this function */
+ 	return 0;
+ }
+ #endif /* CONFIG_LIVEPATCH */
+ 
+ #ifdef CONFIG_STACKLEAK_METRICS
+ static int proc_stack_depth(struct seq_file *m, struct pid_namespace *ns,
+ 				struct pid *pid, struct task_struct *task)
+ {
+ 	/* Depth = THREAD_SIZE minus the recorded address's offset inside the stack area. */
+ 	const unsigned long mask = THREAD_SIZE - 1;
+ 	const unsigned long prev_depth = THREAD_SIZE - (task->prev_lowest_stack & mask);
+ 	const unsigned long depth = THREAD_SIZE - (task->lowest_stack & mask);
+ 
+ 	seq_printf(m, "previous stack depth: %lu\nstack depth: %lu\n",
+ 		   prev_depth, depth);
+ 	return 0;
+ }
+ #endif /* CONFIG_STACKLEAK_METRICS */
+ 
+ /*
+  * Thread groups
+  */
+ static const struct file_operations proc_task_operations;
+ static const struct inode_operations proc_task_inode_operations;
+ 
+ static const struct pid_entry tgid_base_stuff[] = {	/* table behind proc_tgid_base_lookup()/proc_tgid_base_readdir() */
+ 	DIR("task",       S_IRUGO|S_IXUGO, proc_task_inode_operations, proc_task_operations),
+ 	DIR("fd",         S_IRUSR|S_IXUSR, proc_fd_inode_operations, proc_fd_operations),
+ 	DIR("map_files",  S_IRUSR|S_IXUSR, proc_map_files_inode_operations, proc_map_files_operations),
+ 	DIR("fdinfo",     S_IRUSR|S_IXUSR, proc_fdinfo_inode_operations, proc_fdinfo_operations),
+ 	DIR("ns",	  S_IRUSR|S_IXUGO, proc_ns_dir_inode_operations, proc_ns_dir_operations),
+ #ifdef CONFIG_NET
+ 	DIR("net",        S_IRUGO|S_IXUGO, proc_net_inode_operations, proc_net_operations),
+ #endif
+ 	REG("environ",    S_IRUSR, proc_environ_operations),
+ 	REG("auxv",       S_IRUSR, proc_auxv_operations),
+ 	ONE("status",     S_IRUGO, proc_pid_status),
+ 	ONE("personality", S_IRUSR, proc_pid_personality),
+ 	ONE("limits",	  S_IRUGO, proc_pid_limits),
+ #ifdef CONFIG_SCHED_DEBUG
+ 	REG("sched",      S_IRUGO|S_IWUSR, proc_pid_sched_operations),
+ #endif
+ #ifdef CONFIG_SCHED_AUTOGROUP
+ 	REG("autogroup",  S_IRUGO|S_IWUSR, proc_pid_sched_autogroup_operations),
+ #endif
+ 	REG("comm",      S_IRUGO|S_IWUSR, proc_pid_set_comm_operations),
+ #ifdef CONFIG_HAVE_ARCH_TRACEHOOK
+ 	ONE("syscall",    S_IRUSR, proc_pid_syscall),
+ #endif
+ 	REG("cmdline",    S_IRUGO, proc_pid_cmdline_ops),
+ 	ONE("stat",       S_IRUGO, proc_tgid_stat),
+ 	ONE("statm",      S_IRUGO, proc_pid_statm),
+ 	REG("maps",       S_IRUGO, proc_pid_maps_operations),
+ #ifdef CONFIG_NUMA
+ 	REG("numa_maps",  S_IRUGO, proc_pid_numa_maps_operations),
+ #endif
+ 	REG("mem",        S_IRUSR|S_IWUSR, proc_mem_operations),
+ 	LNK("cwd",        proc_cwd_link),
+ 	LNK("root",       proc_root_link),
+ 	LNK("exe",        proc_exe_link),
+ 	REG("mounts",     S_IRUGO, proc_mounts_operations),
+ 	REG("mountinfo",  S_IRUGO, proc_mountinfo_operations),
+ 	REG("mountstats", S_IRUSR, proc_mountstats_operations),
+ #ifdef CONFIG_PROC_PAGE_MONITOR
+ 	REG("clear_refs", S_IWUSR, proc_clear_refs_operations),
+ 	REG("smaps",      S_IRUGO, proc_pid_smaps_operations),
+ 	REG("smaps_rollup", S_IRUGO, proc_pid_smaps_rollup_operations),
+ 	REG("pagemap",    S_IRUSR, proc_pagemap_operations),
+ #endif
+ #ifdef CONFIG_SECURITY
+ 	DIR("attr",       S_IRUGO|S_IXUGO, proc_attr_dir_inode_operations, proc_attr_dir_operations),
+ #endif
+ #ifdef CONFIG_KALLSYMS
+ 	ONE("wchan",      S_IRUGO, proc_pid_wchan),
+ #endif
+ #ifdef CONFIG_STACKTRACE
+ 	ONE("stack",      S_IRUSR, proc_pid_stack),
+ #endif
+ #ifdef CONFIG_SCHED_INFO
+ 	ONE("schedstat",  S_IRUGO, proc_pid_schedstat),
+ #endif
+ #ifdef CONFIG_LATENCYTOP
+ 	REG("latency",  S_IRUGO, proc_lstats_operations),
+ #endif
+ #ifdef CONFIG_PROC_PID_CPUSET
+ 	ONE("cpuset",     S_IRUGO, proc_cpuset_show),
+ #endif
+ #ifdef CONFIG_CGROUPS
+ 	ONE("cgroup",  S_IRUGO, proc_cgroup_show),
+ #endif
+ 	ONE("oom_score",  S_IRUGO, proc_oom_score),
+ 	REG("oom_adj",    S_IRUGO|S_IWUSR, proc_oom_adj_operations),
+ 	REG("oom_score_adj", S_IRUGO|S_IWUSR, proc_oom_score_adj_operations),
+ #ifdef CONFIG_AUDIT
+ 	REG("loginuid",   S_IWUSR|S_IRUGO, proc_loginuid_operations),
+ 	REG("sessionid",  S_IRUGO, proc_sessionid_operations),
+ #endif
+ #ifdef CONFIG_FAULT_INJECTION
+ 	REG("make-it-fail", S_IRUGO|S_IWUSR, proc_fault_inject_operations),
+ 	REG("fail-nth", 0644, proc_fail_nth_operations),
+ #endif
+ #ifdef CONFIG_ELF_CORE
+ 	REG("coredump_filter", S_IRUGO|S_IWUSR, proc_coredump_filter_operations),
+ #endif
+ #ifdef CONFIG_TASK_IO_ACCOUNTING
+ 	ONE("io",	S_IRUSR, proc_tgid_io_accounting),
+ #endif
+ #ifdef CONFIG_USER_NS
+ 	REG("uid_map",    S_IRUGO|S_IWUSR, proc_uid_map_operations),
+ 	REG("gid_map",    S_IRUGO|S_IWUSR, proc_gid_map_operations),
+ 	REG("projid_map", S_IRUGO|S_IWUSR, proc_projid_map_operations),
+ 	REG("setgroups",  S_IRUGO|S_IWUSR, proc_setgroups_operations),
+ #endif
+ #if defined(CONFIG_CHECKPOINT_RESTORE) && defined(CONFIG_POSIX_TIMERS)
+ 	REG("timers",	  S_IRUGO, proc_timers_operations),
+ #endif
+ 	REG("timerslack_ns", S_IRUGO|S_IWUGO, proc_pid_set_timerslack_ns_operations),	/* write access is enforced in timerslack_ns_write() */
+ #ifdef CONFIG_LIVEPATCH
+ 	ONE("patch_state",  S_IRUSR, proc_pid_patch_state),
+ #endif
+ #ifdef CONFIG_STACKLEAK_METRICS
+ 	ONE("stack_depth", S_IRUGO, proc_stack_depth),
+ #endif
+ #ifdef CONFIG_PROC_PID_ARCH_STATUS
+ 	ONE("arch_status", S_IRUGO, proc_pid_arch_status),
+ #endif
+ };
+ 
+ static int proc_tgid_base_readdir(struct file *file, struct dir_context *ctx)
+ {
+ 	const unsigned int nents = ARRAY_SIZE(tgid_base_stuff);	/* whole table, every time */
+ 	return proc_pident_readdir(file, ctx, tgid_base_stuff, nents);
+ }
+ 
+ static const struct file_operations proc_tgid_base_operations = {	/* also the identity tgid_pidfd_to_pid() tests for */
+ 	.read		= generic_read_dir,
+ 	.iterate_shared	= proc_tgid_base_readdir,
+ 	.llseek		= generic_file_llseek,
+ };
+ 
+ struct pid *tgid_pidfd_to_pid(const struct file *file)
+ {
+ 	/* Only files opened with proc_tgid_base_operations qualify; anything else is -EBADF. */
+ 	if (file->f_op == &proc_tgid_base_operations)
+ 		return proc_pid(file_inode(file));
+ 	return ERR_PTR(-EBADF);
+ }
+ 
+ static struct dentry *proc_tgid_base_lookup(struct inode *dir, struct dentry *dentry, unsigned int flags)
+ {
+ 	const struct pid_entry *const first = tgid_base_stuff;
+ 	/* @flags is not used; the table is handed over as a [first, end) range. */
+ 	return proc_pident_lookup(dir, dentry, first, first + ARRAY_SIZE(tgid_base_stuff));
+ }
+ 
+ static const struct inode_operations proc_tgid_base_inode_operations = {	/* installed on the inode by proc_pid_instantiate() */
+ 	.lookup		= proc_tgid_base_lookup,
+ 	.getattr	= pid_getattr,
+ 	.setattr	= proc_setattr,
+ 	.permission	= proc_pid_permission,
+ };
+ 
+ static void proc_flush_task_mnt(struct vfsmount *mnt, pid_t pid, pid_t tgid)
+ {	/* On one procfs mount, invalidate cached <pid> and <tgid>/task/<pid> dentries if present. */
+ 	struct dentry *dentry, *leader, *dir;
+ 	char buf[10 + 1];	/* presumably room for a 10-digit decimal id plus NUL - confirm */
+ 	struct qstr name;
+ 
+ 	name.name = buf;
+ 	name.len = snprintf(buf, sizeof(buf), "%u", pid);
+ 	/* no ->d_hash() rejects on procfs */
+ 	dentry = d_hash_and_lookup(mnt->mnt_root, &name);
+ 	if (dentry) {
+ 		d_invalidate(dentry);
+ 		dput(dentry);
+ 	}
+ 
+ 	if (pid == tgid)	/* pid equals tgid: only the top-level entry is flushed */
+ 		return;
+ 
+ 	name.name = buf;
+ 	name.len = snprintf(buf, sizeof(buf), "%u", tgid);
+ 	leader = d_hash_and_lookup(mnt->mnt_root, &name);
+ 	if (!leader)
+ 		goto out;
+ 
+ 	name.name = "task";
+ 	name.len = strlen(name.name);
+ 	dir = d_hash_and_lookup(leader, &name);
+ 	if (!dir)
+ 		goto out_put_leader;
+ 
+ 	name.name = buf;
+ 	name.len = snprintf(buf, sizeof(buf), "%u", pid);	/* buf is reused: it held the tgid a moment ago */
+ 	dentry = d_hash_and_lookup(dir, &name);
+ 	if (dentry) {
+ 		d_invalidate(dentry);
+ 		dput(dentry);
+ 	}
+ 
+ 	dput(dir);
+ out_put_leader:
+ 	dput(leader);
+ out:
+ 	return;
+ }
+ 
+ /**
+  * proc_flush_task -  Remove dcache entries for @task from the /proc dcache.
+  * @task: task that should be flushed.
+  *
+  * When flushing dentries from proc, one needs to flush them from global
+  * proc (proc_mnt) and from all the namespaces' procs this task was seen
+  * in. This call is supposed to do all of this job.
+  *
+  * Looks in the dcache for
+  * /proc/@pid
+  * /proc/@tgid/task/@pid
+  * if either directory is present, flushes it and all of its children
+  * from the dcache.
+  *
+  * It is safe and reasonable to cache /proc entries for a task until
+  * that task exits.  After that they just clog up the dcache with
+  * useless entries, possibly causing useful dcache entries to be
+  * flushed instead.  This routine is provided to flush those useless
+  * dcache entries at process exit time.
+  *
+  * NOTE: This routine is just an optimization so it does not guarantee
+  *       that no dcache entries will exist at process exit time it
+  *       just makes it very unlikely that any will persist.
+  */
+ 
+ void proc_flush_task(struct task_struct *task)
+ {
+ 	struct pid *const pid = task_pid(task);
+ 	struct pid *const tgid = task_tgid(task);
+ 	unsigned int level;
+ 
+ 	/* Visit every pid namespace level this task has a number in,
+ 	 * flushing the matching procfs mount each time. */
+ 	for (level = 0; level <= pid->level; level++) {
+ 		const struct upid *upid = &pid->numbers[level];
+ 
+ 		proc_flush_task_mnt(upid->ns->proc_mnt, upid->nr,
+ 				    tgid->numbers[level].nr);
+ 	}
+ }
+ 
+ static struct dentry *proc_pid_instantiate(struct dentry * dentry,
+ 				   struct task_struct *task, const void *ptr)
+ {	/* Build the directory inode for one task and attach it to @dentry; @ptr is not used. */
+ 	struct inode *inode;
+ 
+ #ifdef CONFIG_MINISEC_PROC_USER	/* directory mode: read/search for the owner only */
+ 	inode = proc_pid_make_inode(dentry->d_sb, task, S_IFDIR | S_IRUSR | S_IXUSR);
+ #elif defined(CONFIG_MINISEC_PROC_USERGROUP)	/* owner and group */
+ 	inode = proc_pid_make_inode(dentry->d_sb, task, S_IFDIR | S_IRUSR | S_IRGRP | S_IXUSR | S_IXGRP);
+ #else	/* neither option set: read/search for everyone */
+ 	inode = proc_pid_make_inode(dentry->d_sb, task, S_IFDIR | S_IRUGO | S_IXUGO);
+ #endif
+ 	if (!inode)
+ 		return ERR_PTR(-ENOENT);
+ 
+ 	inode->i_op = &proc_tgid_base_inode_operations;
+ 	inode->i_fop = &proc_tgid_base_operations;
+ 	inode->i_flags|=S_IMMUTABLE;
+ 
+ 	set_nlink(inode, nlink_tgid);
+ 	pid_update_inode(task, inode);
+ 
+ 	d_set_d_op(dentry, &pid_dentry_operations);
+ 	return d_splice_alias(inode, dentry);
+ }
+ 
+ struct dentry *proc_pid_lookup(struct dentry *dentry, unsigned int flags)
+ {
+ 	struct pid_namespace *ns = dentry->d_sb->s_fs_info;
+ 	const unsigned tgid = name_to_int(&dentry->d_name);
+ 	struct task_struct *task;
+ 	struct dentry *result;
+ 
+ 	/* ~0U from name_to_int() is treated as "no such entry". */
+ 	if (tgid == ~0U)
+ 		return ERR_PTR(-ENOENT);
+ 
+ 	/* Take the task reference before leaving the RCU section that found it. */
+ 	rcu_read_lock();
+ 	task = find_task_by_pid_ns(tgid, ns);
+ 	if (task)
+ 		get_task_struct(task);
+ 	rcu_read_unlock();
+ 	if (!task)
+ 		return ERR_PTR(-ENOENT);
+ 
+ 	result = proc_pid_instantiate(dentry, task, NULL);
+ 	put_task_struct(task);
+ 
+ 	return result;
+ }
+ 
+ /*
+  * Find the first task with tgid >= tgid
+  *
+  */
+ struct tgid_iter {	/* cursor passed to and returned from next_tgid() by value */
+ 	unsigned int tgid;	/* in: lowest id to consider; out: id of the task found */
+ 	struct task_struct *task;	/* referenced task, or NULL when the walk is over */
+ };
+ static struct tgid_iter next_tgid(struct pid_namespace *ns, struct tgid_iter iter)
+ {	/* Drop iter.task if set, then return the next qualifying task at or after iter.tgid. */
+ 	struct pid *pid;
+ 
+ 	if (iter.task)
+ 		put_task_struct(iter.task);
+ 	rcu_read_lock();
+ retry:
+ 	iter.task = NULL;
+ 	pid = find_ge_pid(iter.tgid, ns);
+ 	if (pid) {
+ 		iter.tgid = pid_nr_ns(pid, ns);
+ 		iter.task = pid_task(pid, PIDTYPE_PID);
+ 		/* What we want to know is if the pid we have found is the
+ 		 * pid of a thread_group_leader.  Testing for task
+ 		 * being a thread_group_leader is the obvious thing
+ 		 * to do but there is a window when it fails, due to
+ 		 * the pid transfer logic in de_thread.
+ 		 *
+ 		 * So we perform the straight forward test of seeing
+ 		 * if the pid we have found is the pid of a thread
+ 		 * group leader, and don't worry if the task we have
+ 		 * found doesn't happen to be a thread group leader.
+ 		 * As we don't care in the case of readdir.
+ 		 */
+ 		if (!iter.task || !has_group_leader_pid(iter.task)) {
+ 			iter.tgid += 1;	/* step past this id and search again */
+ 			goto retry;
+ 		}
+ 		get_task_struct(iter.task);	/* reference taken while still inside the RCU section */
+ 	}
+ 	rcu_read_unlock();
+ 	return iter;
+ }
+ 
+ #define TGID_OFFSET (FIRST_PROCESS_ENTRY + 2)
+ 
+ /* for the /proc/ directory itself, after non-process stuff has been done */
+ int proc_pid_readdir(struct file *file, struct dir_context *ctx)
+ {	/* Emit "self", "thread-self" and then one entry per visible task, resuming from ctx->pos. */
+ 	struct tgid_iter iter;
+ 	struct pid_namespace *ns = proc_pid_ns(file_inode(file));
+ 	loff_t pos = ctx->pos;
+ 
+ 	if (pos >= PID_MAX_LIMIT + TGID_OFFSET)	/* the end marker set at the bottom of this function */
+ 		return 0;
+ 
+ 	if (pos == TGID_OFFSET - 2) {
+ 		struct inode *inode = d_inode(ns->proc_self);
+ 		if (!dir_emit(ctx, "self", 4, inode->i_ino, DT_LNK))
+ 			return 0;
+ 		ctx->pos = pos = pos + 1;
+ 	}
+ 	if (pos == TGID_OFFSET - 1) {
+ 		struct inode *inode = d_inode(ns->proc_thread_self);
+ 		if (!dir_emit(ctx, "thread-self", 11, inode->i_ino, DT_LNK))
+ 			return 0;
+ 		ctx->pos = pos = pos + 1;
+ 	}
+ 	iter.tgid = pos - TGID_OFFSET;	/* directory position -> first tgid to look for */
+ 	iter.task = NULL;
+ 	for (iter = next_tgid(ns, iter);
+ 	     iter.task;
+ 	     iter.tgid += 1, iter = next_tgid(ns, iter)) {	/* next_tgid() drops the previous task reference */
+ 		char name[10 + 1];
+ 		unsigned int len;
+ 
+ 		cond_resched();
+ 		if (!has_pid_permissions(ns, iter.task, HIDEPID_INVISIBLE))
+ 			continue;
+ 
+ 		len = snprintf(name, sizeof(name), "%u", iter.tgid);
+ 		ctx->pos = iter.tgid + TGID_OFFSET;
+ 		if (!proc_fill_cache(file, ctx, name, len,
+ 				     proc_pid_instantiate, iter.task, NULL)) {
+ 			put_task_struct(iter.task);	/* leaving the loop early, so drop the reference here */
+ 			return 0;
+ 		}
+ 	}
+ 	ctx->pos = PID_MAX_LIMIT + TGID_OFFSET;
+ 	return 0;
+ }
+ 
+ /*
+  * proc_tid_comm_permission is a special permission function exclusively
+  * used for the node /proc/<pid>/task/<tid>/comm.
+  * It bypasses generic permission checks in the case where a task of the same
+  * task group attempts to access the node.
+  * The rationale behind this is that glibc and bionic access this node for
+  * cross thread naming (pthread_set/getname_np(!self)). However, if
+  * PR_SET_DUMPABLE gets set to 0 this node among others becomes uid=0 gid=0,
+  * which locks out the cross thread naming implementation.
+  * This function makes sure that the node is always accessible for members of
+  * same thread group.
+  */
+ static int proc_tid_comm_permission(struct inode *inode, int mask)
+ {
+ 	bool is_same_tgroup;
+ 	struct task_struct *task;
+ 
+ 	task = get_proc_task(inode);
+ 	if (!task)
+ 		return -ESRCH;
+ 	is_same_tgroup = same_thread_group(current, task);
+ 	put_task_struct(task);
+ 
+ 	if (likely(is_same_tgroup && !(mask & MAY_EXEC))) {
+ 		/* This file (/proc/<pid>/task/<tid>/comm) can always be
+ 		 * read or written by the members of the corresponding
+ 		 * thread group.
+ 		 */
+ 		return 0;
+ 	}
+ 
+ 	return generic_permission(inode, mask);
+ }
+ 
+ static const struct inode_operations proc_tid_comm_inode_operations = {
+ 		.permission = proc_tid_comm_permission,
+ };
+ 
+ /*
+  * Tasks
+  */
+ static const struct pid_entry tid_base_stuff[] = {
+ 	DIR("fd",        S_IRUSR|S_IXUSR, proc_fd_inode_operations, proc_fd_operations),
+ 	DIR("fdinfo",    S_IRUSR|S_IXUSR, proc_fdinfo_inode_operations, proc_fdinfo_operations),
+ 	DIR("ns",	 S_IRUSR|S_IXUGO, proc_ns_dir_inode_operations, proc_ns_dir_operations),
+ #ifdef CONFIG_NET
+ 	DIR("net",        S_IRUGO|S_IXUGO, proc_net_inode_operations, proc_net_operations),
+ #endif
+ 	REG("environ",   S_IRUSR, proc_environ_operations),
+ 	REG("auxv",      S_IRUSR, proc_auxv_operations),
+ 	ONE("status",    S_IRUGO, proc_pid_status),
+ 	ONE("personality", S_IRUSR, proc_pid_personality),
+ 	ONE("limits",	 S_IRUGO, proc_pid_limits),
+ #ifdef CONFIG_SCHED_DEBUG
+ 	REG("sched",     S_IRUGO|S_IWUSR, proc_pid_sched_operations),
+ #endif
+ 	NOD("comm",      S_IFREG|S_IRUGO|S_IWUSR,
+ 			 &proc_tid_comm_inode_operations,
+ 			 &proc_pid_set_comm_operations, {}),
+ #ifdef CONFIG_HAVE_ARCH_TRACEHOOK
+ 	ONE("syscall",   S_IRUSR, proc_pid_syscall),
+ #endif
+ 	REG("cmdline",   S_IRUGO, proc_pid_cmdline_ops),
+ 	ONE("stat",      S_IRUGO, proc_tid_stat),
+ 	ONE("statm",     S_IRUGO, proc_pid_statm),
+ 	REG("maps",      S_IRUGO, proc_pid_maps_operations),
+ #ifdef CONFIG_PROC_CHILDREN
+ 	REG("children",  S_IRUGO, proc_tid_children_operations),
+ #endif
+ #ifdef CONFIG_NUMA
+ 	REG("numa_maps", S_IRUGO, proc_pid_numa_maps_operations),
+ #endif
+ 	REG("mem",       S_IRUSR|S_IWUSR, proc_mem_operations),
+ 	LNK("cwd",       proc_cwd_link),
+ 	LNK("root",      proc_root_link),
+ 	LNK("exe",       proc_exe_link),
+ 	REG("mounts",    S_IRUGO, proc_mounts_operations),
+ 	REG("mountinfo",  S_IRUGO, proc_mountinfo_operations),
+ #ifdef CONFIG_PROC_PAGE_MONITOR
+ 	REG("clear_refs", S_IWUSR, proc_clear_refs_operations),
+ 	REG("smaps",     S_IRUGO, proc_pid_smaps_operations),
+ 	REG("smaps_rollup", S_IRUGO, proc_pid_smaps_rollup_operations),
+ 	REG("pagemap",    S_IRUSR, proc_pagemap_operations),
+ #endif
+ #ifdef CONFIG_SECURITY
+ 	DIR("attr",      S_IRUGO|S_IXUGO, proc_attr_dir_inode_operations, proc_attr_dir_operations),
+ #endif
+ #ifdef CONFIG_KALLSYMS
+ 	ONE("wchan",     S_IRUGO, proc_pid_wchan),
+ #endif
+ #ifdef CONFIG_STACKTRACE
+ 	ONE("stack",      S_IRUSR, proc_pid_stack),
+ #endif
+ #ifdef CONFIG_SCHED_INFO
+ 	ONE("schedstat", S_IRUGO, proc_pid_schedstat),
+ #endif
+ #ifdef CONFIG_LATENCYTOP
+ 	REG("latency",  S_IRUGO, proc_lstats_operations),
+ #endif
+ #ifdef CONFIG_PROC_PID_CPUSET
+ 	ONE("cpuset",    S_IRUGO, proc_cpuset_show),
+ #endif
+ #ifdef CONFIG_CGROUPS
+ 	ONE("cgroup",  S_IRUGO, proc_cgroup_show),
+ #endif
+ 	ONE("oom_score", S_IRUGO, proc_oom_score),
+ 	REG("oom_adj",   S_IRUGO|S_IWUSR, proc_oom_adj_operations),
+ 	REG("oom_score_adj", S_IRUGO|S_IWUSR, proc_oom_score_adj_operations),
+ #ifdef CONFIG_AUDIT
+ 	REG("loginuid",  S_IWUSR|S_IRUGO, proc_loginuid_operations),
+ 	REG("sessionid",  S_IRUGO, proc_sessionid_operations),
+ #endif
+ #ifdef CONFIG_FAULT_INJECTION
+ 	REG("make-it-fail", S_IRUGO|S_IWUSR, proc_fault_inject_operations),
+ 	REG("fail-nth", 0644, proc_fail_nth_operations),
+ #endif
+ #ifdef CONFIG_TASK_IO_ACCOUNTING
+ 	ONE("io",	S_IRUSR, proc_tid_io_accounting),
+ #endif
+ #ifdef CONFIG_USER_NS
+ 	REG("uid_map",    S_IRUGO|S_IWUSR, proc_uid_map_operations),
+ 	REG("gid_map",    S_IRUGO|S_IWUSR, proc_gid_map_operations),
+ 	REG("projid_map", S_IRUGO|S_IWUSR, proc_projid_map_operations),
+ 	REG("setgroups",  S_IRUGO|S_IWUSR, proc_setgroups_operations),
+ #endif
+ #ifdef CONFIG_LIVEPATCH
+ 	ONE("patch_state",  S_IRUSR, proc_pid_patch_state),
+ #endif
+ #ifdef CONFIG_PROC_PID_ARCH_STATUS
+ 	ONE("arch_status", S_IRUGO, proc_pid_arch_status),
+ #endif
+ };
+ 
+ static int proc_tid_base_readdir(struct file *file, struct dir_context *ctx)
+ {
+ 	return proc_pident_readdir(file, ctx,
+ 				   tid_base_stuff, ARRAY_SIZE(tid_base_stuff));
+ }
+ 
+ static struct dentry *proc_tid_base_lookup(struct inode *dir, struct dentry *dentry, unsigned int flags)
+ {
+ 	return proc_pident_lookup(dir, dentry,
+ 				  tid_base_stuff,
+ 				  tid_base_stuff + ARRAY_SIZE(tid_base_stuff));
+ }
+ 
+ static const struct file_operations proc_tid_base_operations = {
+ 	.read		= generic_read_dir,
+ 	.iterate_shared	= proc_tid_base_readdir,
+ 	.llseek		= generic_file_llseek,
+ };
+ 
+ static const struct inode_operations proc_tid_base_inode_operations = {
+ 	.lookup		= proc_tid_base_lookup,
+ 	.getattr	= pid_getattr,
+ 	.setattr	= proc_setattr,
+ };
+ 
+ static struct dentry *proc_task_instantiate(struct dentry *dentry,
+ 	struct task_struct *task, const void *ptr)
+ {
+ 	struct inode *inode;
+ 
+ #ifdef CONFIG_MINISEC_PROC_USER
+ 	inode = proc_pid_make_inode(dentry->d_sb, task, S_IFDIR | S_IRUSR | S_IXUSR);
+ #elif defined(CONFIG_MINISEC_PROC_USERGROUP)
+ 	inode = proc_pid_make_inode(dentry->d_sb, task, S_IFDIR | S_IRUSR | S_IRGRP | S_IXUSR | S_IXGRP);
+ #else
+ 	inode = proc_pid_make_inode(dentry->d_sb, task, S_IFDIR | S_IRUGO | S_IXUGO);
+ #endif
+ 	if (!inode)
+ 		return ERR_PTR(-ENOENT);
+ 
+ 	inode->i_op = &proc_tid_base_inode_operations;
+ 	inode->i_fop = &proc_tid_base_operations;
+ 	inode->i_flags |= S_IMMUTABLE;
+ 
+ 	set_nlink(inode, nlink_tid);
+ 	pid_update_inode(task, inode);
+ 
+ 	d_set_d_op(dentry, &pid_dentry_operations);
+ 	return d_splice_alias(inode, dentry);
+ }
+ 
+ static struct dentry *proc_task_lookup(struct inode *dir, struct dentry * dentry, unsigned int flags)
+ {
+ 	struct task_struct *task;
+ 	struct task_struct *leader = get_proc_task(dir);
+ 	unsigned tid;
+ 	struct pid_namespace *ns;
+ 	struct dentry *result = ERR_PTR(-ENOENT);
+ 
+ 	if (!leader)
+ 		goto out_no_task;
+ 
+ 	tid = name_to_int(&dentry->d_name);
+ 	if (tid == ~0U)
+ 		goto out;
+ 
+ 	ns = dentry->d_sb->s_fs_info;
+ 	rcu_read_lock();
+ 	task = find_task_by_pid_ns(tid, ns);
+ 	if (task)
+ 		get_task_struct(task);
+ 	rcu_read_unlock();
+ 	if (!task)
+ 		goto out;
+ 	if (!same_thread_group(leader, task))
+ 		goto out_drop_task;
+ 
+ 	result = proc_task_instantiate(dentry, task, NULL);
+ out_drop_task:
+ 	put_task_struct(task);
+ out:
+ 	put_task_struct(leader);
+ out_no_task:
+ 	return result;
+ }
+ 
+ /*
+  * Find the first tid of a thread group to return to user space.
+  *
+  * Usually this is just the thread group leader, but if the user's
+  * buffer was too small or there was a seek into the middle of the
+  * directory we have more work to do.
+  *
+  * In the case of a short read we start with find_task_by_pid_ns.
+  *
+  * In the case of a seek we start with the leader and walk nr
+  * threads past it.
+  */
+ static struct task_struct *first_tid(struct pid *pid, int tid, loff_t f_pos,
+ 					struct pid_namespace *ns)
+ {
+ 	struct task_struct *pos, *task;
+ 	unsigned long nr = f_pos;
+ 
+ 	if (nr != f_pos)	/* f_pos does not fit in unsigned long (32bit overflow) */
+ 		return NULL;
+ 
+ 	rcu_read_lock();
+ 	task = pid_task(pid, PIDTYPE_PID);
+ 	if (!task)
+ 		goto fail;
+ 
+ 	/* Attempt to start with the tid of a thread */
+ 	if (tid && nr) {
+ 		pos = find_task_by_pid_ns(tid, ns);
+ 		if (pos && same_thread_group(pos, task))
+ 			goto found;
+ 	}
+ 
+ 	/* If nr is not below the number of threads there is nothing to do */
+ 	if (nr >= get_nr_threads(task))
+ 		goto fail;
+ 
+ 	/* If we haven't found our starting place yet, start
+ 	 * with the leader and walk nr threads forward.
+ 	 */
+ 	pos = task = task->group_leader;
+ 	do {
+ 		if (!nr--)
+ 			goto found;
+ 	} while_each_thread(task, pos);
+ fail:
+ 	pos = NULL;
+ 	goto out;
+ found:
+ 	get_task_struct(pos);
+ out:
+ 	rcu_read_unlock();
+ 	return pos;
+ }
+ 
+ /*
+  * Find the next thread in the thread list.
+  * Return NULL if there is an error or no next thread.
+  *
+  * The reference to the input task_struct is released.
+  */
+ static struct task_struct *next_tid(struct task_struct *start)
+ {
+ 	struct task_struct *pos = NULL;
+ 	rcu_read_lock();
+ 	if (pid_alive(start)) {
+ 		pos = next_thread(start);
+ 		if (thread_group_leader(pos))
+ 			pos = NULL;
+ 		else
+ 			get_task_struct(pos);
+ 	}
+ 	rcu_read_unlock();
+ 	put_task_struct(start);
+ 	return pos;
+ }
+ 
+ /* for the /proc/TGID/task/ directories */
+ static int proc_task_readdir(struct file *file, struct dir_context *ctx)
+ {
+ 	struct inode *inode = file_inode(file);
+ 	struct task_struct *task;
+ 	struct pid_namespace *ns;
+ 	int tid;
+ 
+ 	if (proc_inode_is_dead(inode))
+ 		return -ENOENT;
+ 
+ 	if (!dir_emit_dots(file, ctx))
+ 		return 0;
+ 
+ 	/* f_version caches the tid value that the last readdir call couldn't
+ 	 * return. lseek aka telldir automagically resets f_version to 0.
+ 	 */
+ 	ns = proc_pid_ns(inode);
+ 	tid = (int)file->f_version;
+ 	file->f_version = 0;
+ 	for (task = first_tid(proc_pid(inode), tid, ctx->pos - 2, ns);
+ 	     task;
+ 	     task = next_tid(task), ctx->pos++) {
+ 		char name[10 + 1];
+ 		unsigned int len;
+ 		tid = task_pid_nr_ns(task, ns);
+ 		len = snprintf(name, sizeof(name), "%u", tid);
+ 		if (!proc_fill_cache(file, ctx, name, len,
+ 				proc_task_instantiate, task, NULL)) {
+ 			/* returning this tid failed, save it as the first
+ 			 * tid for the next readdir call */
+ 			file->f_version = (u64)tid;
+ 			put_task_struct(task);
+ 			break;
+ 		}
+ 	}
+ 
+ 	return 0;
+ }
+ 
+ static int proc_task_getattr(const struct path *path, struct kstat *stat,
+ 			     u32 request_mask, unsigned int query_flags)
+ {
+ 	struct inode *inode = d_inode(path->dentry);
+ 	struct task_struct *p = get_proc_task(inode);
+ 	generic_fillattr(inode, stat);
+ 
+ 	if (p) {
+ 		stat->nlink += get_nr_threads(p);
+ 		put_task_struct(p);
+ 	}
+ 
+ 	return 0;
+ }
+ 
+ static const struct inode_operations proc_task_inode_operations = {
+ 	.lookup		= proc_task_lookup,
+ 	.getattr	= proc_task_getattr,
+ 	.setattr	= proc_setattr,
+ 	.permission	= proc_pid_permission,
+ };
+ 
+ static const struct file_operations proc_task_operations = {
+ 	.read		= generic_read_dir,
+ 	.iterate_shared	= proc_task_readdir,
+ 	.llseek		= generic_file_llseek,
+ };
+ 
+ void __init set_proc_pid_nlink(void)
+ {
+ 	nlink_tid = pid_entry_nlink(tid_base_stuff, ARRAY_SIZE(tid_base_stuff));
+ 	nlink_tgid = pid_entry_nlink(tgid_base_stuff, ARRAY_SIZE(tgid_base_stuff));
+ }
diff --color -rcNP Master/fs/proc/base.c.rej OG/fs/proc/base.c.rej
*** Master/fs/proc/base.c.rej	1969-12-31 19:00:00.000000000 -0500
--- OG/fs/proc/base.c.rej	2021-04-20 15:11:27.317000000 -0400
***************
*** 0 ****
--- 1,36 ----
+ *** fs/proc/base.c	2021-03-13 14:56:55.000000000 +0200
+ --- fs/proc/base.c	2021-03-11 15:06:51.000000000 +0200
+ ***************
+ *** 2472,2478 ****
+   	return d_splice_alias(inode, dentry);
+   }
+   
+ ! static struct dentry *proc_pident_lookup(struct inode *dir,
+   					 struct dentry *dentry,
+   					 const struct pid_entry *p,
+   					 const struct pid_entry *end)
+ --- 2439,2445 ----
+   	return d_splice_alias(inode, dentry);
+   }
+   
+ ! static struct dentry *proc_pident_lookup(struct inode *dir,
+   					 struct dentry *dentry,
+   					 const struct pid_entry *p,
+   					 const struct pid_entry *end)
+ ***************
+ *** 2659,2665 ****
+   
+   static int proc_attr_dir_readdir(struct file *file, struct dir_context *ctx)
+   {
+ ! 	return proc_pident_readdir(file, ctx,
+   				   attr_dir_stuff, ARRAY_SIZE(attr_dir_stuff));
+   }
+   
+ --- 2620,2626 ----
+   
+   static int proc_attr_dir_readdir(struct file *file, struct dir_context *ctx)
+   {
+ ! 	return proc_pident_readdir(file, ctx,
+   				   attr_dir_stuff, ARRAY_SIZE(attr_dir_stuff));
+   }
+   
diff --color -rcNP Master/fs/proc/inode.c OG/fs/proc/inode.c
*** Master/fs/proc/inode.c	2021-04-20 14:17:31.000000000 -0400
--- OG/fs/proc/inode.c	2021-04-20 15:11:34.512000000 -0400
***************
*** 24,29 ****
--- 24,30 ----
  #include <linux/seq_file.h>
  #include <linux/slab.h>
  #include <linux/mount.h>
+ #include <linux/minisec.h>
  
  #include <linux/uaccess.h>
  
***************
*** 462,468 ****
  		if (de->mode) {
  			inode->i_mode = de->mode;
  			inode->i_uid = de->uid;
! 			inode->i_gid = de->gid;
  		}
  		if (de->size)
  			inode->i_size = de->size;
--- 463,473 ----
  		if (de->mode) {
  			inode->i_mode = de->mode;
  			inode->i_uid = de->uid;
! #ifdef CONFIG_MINISEC_PROC_USERGROUP
! 			inode->i_gid = grsec_proc_gid;
! #else
! 			inode->i_gid = de->gid;
! #endif
  		}
  		if (de->size)
  			inode->i_size = de->size;
diff --color -rcNP Master/fs/proc/kcore.c OG/fs/proc/kcore.c
*** Master/fs/proc/kcore.c	2021-04-20 14:17:31.000000000 -0400
--- OG/fs/proc/kcore.c	2021-04-20 15:11:34.512000000 -0400
***************
*** 548,553 ****
--- 548,557 ----
  {
  	int ret = security_locked_down(LOCKDOWN_KCORE);
  
+ #if defined(CONFIG_MINISEC_PROC_ADD) || defined(CONFIG_MINISEC_HIDESYM)
+ 	return -EPERM;
+ #endif
+ 
  	if (!capable(CAP_SYS_RAWIO))
  		return -EPERM;
  
diff --color -rcNP Master/fs/proc/proc_sysctl.c OG/fs/proc/proc_sysctl.c
*** Master/fs/proc/proc_sysctl.c	2021-04-20 14:17:31.000000000 -0400
--- OG/fs/proc/proc_sysctl.c	2021-04-20 15:11:34.512000000 -0400
***************
*** 14,21 ****
--- 14,30 ----
  #include <linux/mm.h>
  #include <linux/module.h>
  #include <linux/bpf-cgroup.h>
+ #include <linux/minisec.h>
+ #include <linux/nsproxy.h>
+ #ifdef CONFIG_MINISEC
+ #include <net/net_namespace.h>
+ #endif
  #include "internal.h"
  
+ extern int gr_handle_chroot_sysctl(const int op);
+ extern int gr_handle_sysctl_mod(const char *dirname, const char *name,
+ 				const int op);
+ 
  static const struct dentry_operations proc_sys_dentry_operations;
  static const struct file_operations proc_sys_file_operations;
  static const struct inode_operations proc_sys_inode_operations;
***************
*** 579,584 ****
--- 588,594 ----
  	struct ctl_table_header *head = grab_header(inode);
  	struct ctl_table *table = PROC_I(inode)->sysctl_entry;
  	void *new_buf = NULL;
+ 	int op = write ? MAY_WRITE : MAY_READ;
  	ssize_t error;
  
  	if (IS_ERR(head))
***************
*** 589,595 ****
  	 * and won't be until we finish.
  	 */
  	error = -EPERM;
! 	if (sysctl_perm(head, table, write ? MAY_WRITE : MAY_READ))
  		goto out;
  
  	/* if that can happen at all, it should be -EINVAL, not -EISDIR */
--- 599,605 ----
  	 * and won't be until we finish.
  	 */
  	error = -EPERM;
! 	if (sysctl_perm(head, table, op))
  		goto out;
  
  	/* if that can happen at all, it should be -EINVAL, not -EISDIR */
***************
*** 602,607 ****
--- 612,633 ----
  	if (error)
  		goto out;
  
+ #ifdef CONFIG_MINISEC
+ 	error = -EPERM;
+ 	if (gr_handle_chroot_sysctl(op))
+ 		goto out;
+ 	if (gr_handle_sysctl_mod(NULL, table->procname, op)) {
+ 		goto out;
+ 	}
+ 	if (write) {
+ 		if (current->nsproxy->net_ns != table->extra2) {
+ 			if (!capable(CAP_SYS_ADMIN))
+ 				goto out;
+ 		} else if (!ns_capable(current->nsproxy->net_ns->user_ns, CAP_NET_ADMIN))
+ 			goto out;
+ 	}
+ #endif
+ 
  	/* careful: calling conventions are nasty here */
  	if (new_buf) {
  		mm_segment_t old_fs;
diff --color -rcNP Master/fs/proc/root.c OG/fs/proc/root.c
*** Master/fs/proc/root.c	2021-04-20 14:17:31.000000000 -0400
--- OG/fs/proc/root.c	2021-04-20 15:11:34.512000000 -0400
***************
*** 118,124 ****
  	 * top of it
  	 */
  	s->s_stack_depth = FILESYSTEM_MAX_STACK_DEPTH;
! 	
  	/* procfs dentries and inodes don't require IO to create */
  	s->s_shrink.seeks = 0;
  
--- 118,124 ----
  	 * top of it
  	 */
  	s->s_stack_depth = FILESYSTEM_MAX_STACK_DEPTH;
! 
  	/* procfs dentries and inodes don't require IO to create */
  	s->s_shrink.seeks = 0;
  
***************
*** 229,235 ****
--- 229,243 ----
  	proc_create_mount_point("openprom");
  #endif
  	proc_tty_init();
+ #ifdef CONFIG_MINISEC_PROC_ADD
+ #ifdef CONFIG_MINISEC_PROC_USER
+ 	proc_mkdir_mode("bus", S_IRUSR | S_IXUSR, NULL);
+ #elif defined(CONFIG_MINISEC_PROC_USERGROUP)
+ 	proc_mkdir_mode("bus", S_IRUSR | S_IXUSR | S_IRGRP | S_IXGRP, NULL);
+ #endif
+ #else
  	proc_mkdir("bus", NULL);
+ #endif
  	proc_sys_init();
  
  	register_filesystem(&proc_fs_type);
***************
*** 286,297 ****
   * This is the root "inode" in the /proc tree..
   */
  struct proc_dir_entry proc_root = {
! 	.low_ino	= PROC_ROOT_INO, 
! 	.namelen	= 5, 
! 	.mode		= S_IFDIR | S_IRUGO | S_IXUGO, 
! 	.nlink		= 2, 
  	.refcnt		= REFCOUNT_INIT(1),
! 	.proc_iops	= &proc_root_inode_operations, 
  	.proc_fops	= &proc_root_operations,
  	.parent		= &proc_root,
  	.subdir		= RB_ROOT,
--- 294,305 ----
   * This is the root "inode" in the /proc tree..
   */
  struct proc_dir_entry proc_root = {
! 	.low_ino	= PROC_ROOT_INO,
! 	.namelen	= 5,
! 	.mode		= S_IFDIR | S_IRUGO | S_IXUGO,
! 	.nlink		= 2,
  	.refcnt		= REFCOUNT_INIT(1),
! 	.proc_iops	= &proc_root_inode_operations,
  	.proc_fops	= &proc_root_operations,
  	.parent		= &proc_root,
  	.subdir		= RB_ROOT,
diff --color -rcNP Master/fs/proc/root.c.orig OG/fs/proc/root.c.orig
*** Master/fs/proc/root.c.orig	1969-12-31 19:00:00.000000000 -0500
--- OG/fs/proc/root.c.orig	2021-04-20 15:10:45.383000000 -0400
***************
*** 0 ****
--- 1,343 ----
+ // SPDX-License-Identifier: GPL-2.0
+ /*
+  *  linux/fs/proc/root.c
+  *
+  *  Copyright (C) 1991, 1992 Linus Torvalds
+  *
+  *  proc root directory handling functions
+  */
+ 
+ #include <linux/uaccess.h>
+ 
+ #include <linux/errno.h>
+ #include <linux/time.h>
+ #include <linux/proc_fs.h>
+ #include <linux/stat.h>
+ #include <linux/init.h>
+ #include <linux/sched.h>
+ #include <linux/sched/stat.h>
+ #include <linux/module.h>
+ #include <linux/bitops.h>
+ #include <linux/user_namespace.h>
+ #include <linux/fs_context.h>
+ #include <linux/mount.h>
+ #include <linux/pid_namespace.h>
+ #include <linux/fs_parser.h>
+ #include <linux/cred.h>
+ #include <linux/magic.h>
+ #include <linux/slab.h>
+ 
+ #include "internal.h"
+ 
+ struct proc_fs_context {
+ 	struct pid_namespace	*pid_ns;
+ 	unsigned int		mask;
+ 	int			hidepid;
+ 	int			gid;
+ };
+ 
+ enum proc_param {
+ 	Opt_gid,
+ 	Opt_hidepid,
+ };
+ 
+ static const struct fs_parameter_spec proc_param_specs[] = {
+ 	fsparam_u32("gid",	Opt_gid),
+ 	fsparam_u32("hidepid",	Opt_hidepid),
+ 	{}
+ };
+ 
+ static const struct fs_parameter_description proc_fs_parameters = {
+ 	.name		= "proc",
+ 	.specs		= proc_param_specs,
+ };
+ 
+ static int proc_parse_param(struct fs_context *fc, struct fs_parameter *param)
+ {
+ 	struct proc_fs_context *ctx = fc->fs_private;
+ 	struct fs_parse_result result;
+ 	int opt;
+ 
+ 	opt = fs_parse(fc, &proc_fs_parameters, param, &result);
+ 	if (opt < 0)
+ 		return opt;
+ 
+ 	switch (opt) {
+ 	case Opt_gid:
+ 		ctx->gid = result.uint_32;
+ 		break;
+ 
+ 	case Opt_hidepid:
+ 		ctx->hidepid = result.uint_32;
+ 		if (ctx->hidepid < HIDEPID_OFF ||
+ 		    ctx->hidepid > HIDEPID_INVISIBLE)
+ 			return invalf(fc, "proc: hidepid value must be between 0 and 2.\n");
+ 		break;
+ 
+ 	default:
+ 		return -EINVAL;
+ 	}
+ 
+ 	ctx->mask |= 1 << opt;
+ 	return 0;
+ }
+ 
+ static void proc_apply_options(struct super_block *s,
+ 			       struct fs_context *fc,
+ 			       struct pid_namespace *pid_ns,
+ 			       struct user_namespace *user_ns)
+ {
+ 	struct proc_fs_context *ctx = fc->fs_private;
+ 
+ 	if (ctx->mask & (1 << Opt_gid))
+ 		pid_ns->pid_gid = make_kgid(user_ns, ctx->gid);
+ 	if (ctx->mask & (1 << Opt_hidepid))
+ 		pid_ns->hide_pid = ctx->hidepid;
+ }
+ 
+ static int proc_fill_super(struct super_block *s, struct fs_context *fc)
+ {
+ 	struct pid_namespace *pid_ns = get_pid_ns(s->s_fs_info);
+ 	struct inode *root_inode;
+ 	int ret;
+ 
+ 	proc_apply_options(s, fc, pid_ns, current_user_ns());
+ 
+ 	/* User space would break if executables or devices appear on proc */
+ 	s->s_iflags |= SB_I_USERNS_VISIBLE | SB_I_NOEXEC | SB_I_NODEV;
+ 	s->s_flags |= SB_NODIRATIME | SB_NOSUID | SB_NOEXEC;
+ 	s->s_blocksize = 1024;
+ 	s->s_blocksize_bits = 10;
+ 	s->s_magic = PROC_SUPER_MAGIC;
+ 	s->s_op = &proc_sops;
+ 	s->s_time_gran = 1;
+ 
+ 	/*
+ 	 * procfs isn't actually a stacking filesystem; however, there is
+ 	 * too much magic going on inside it to permit stacking things on
+ 	 * top of it
+ 	 */
+ 	s->s_stack_depth = FILESYSTEM_MAX_STACK_DEPTH;
+ 	
+ 	/* procfs dentries and inodes don't require IO to create */
+ 	s->s_shrink.seeks = 0;
+ 
+ 	pde_get(&proc_root);
+ 	root_inode = proc_get_inode(s, &proc_root);
+ 	if (!root_inode) {
+ 		pr_err("proc_fill_super: get root inode failed\n");
+ 		return -ENOMEM;
+ 	}
+ 
+ 	s->s_root = d_make_root(root_inode);
+ 	if (!s->s_root) {
+ 		pr_err("proc_fill_super: allocate dentry failed\n");
+ 		return -ENOMEM;
+ 	}
+ 
+ 	ret = proc_setup_self(s);
+ 	if (ret) {
+ 		return ret;
+ 	}
+ 	return proc_setup_thread_self(s);
+ }
+ 
+ static int proc_reconfigure(struct fs_context *fc)
+ {
+ 	struct super_block *sb = fc->root->d_sb;
+ 	struct pid_namespace *pid = sb->s_fs_info;
+ 
+ 	sync_filesystem(sb);
+ 
+ 	proc_apply_options(sb, fc, pid, current_user_ns());
+ 	return 0;
+ }
+ 
+ static int proc_get_tree(struct fs_context *fc)
+ {
+ 	struct proc_fs_context *ctx = fc->fs_private;
+ 
+ 	return get_tree_keyed(fc, proc_fill_super, ctx->pid_ns);
+ }
+ 
+ static void proc_fs_context_free(struct fs_context *fc)
+ {
+ 	struct proc_fs_context *ctx = fc->fs_private;
+ 
+ 	put_pid_ns(ctx->pid_ns);
+ 	kfree(ctx);
+ }
+ 
+ static const struct fs_context_operations proc_fs_context_ops = {
+ 	.free		= proc_fs_context_free,
+ 	.parse_param	= proc_parse_param,
+ 	.get_tree	= proc_get_tree,
+ 	.reconfigure	= proc_reconfigure,
+ };
+ 
+ static int proc_init_fs_context(struct fs_context *fc)
+ {
+ 	struct proc_fs_context *ctx;
+ 
+ 	ctx = kzalloc(sizeof(struct proc_fs_context), GFP_KERNEL);
+ 	if (!ctx)
+ 		return -ENOMEM;
+ 
+ 	ctx->pid_ns = get_pid_ns(task_active_pid_ns(current));
+ 	put_user_ns(fc->user_ns);
+ 	fc->user_ns = get_user_ns(ctx->pid_ns->user_ns);
+ 	fc->fs_private = ctx;
+ 	fc->ops = &proc_fs_context_ops;
+ 	return 0;
+ }
+ 
+ static void proc_kill_sb(struct super_block *sb)
+ {
+ 	struct pid_namespace *ns;
+ 
+ 	ns = (struct pid_namespace *)sb->s_fs_info;
+ 	if (ns->proc_self)
+ 		dput(ns->proc_self);
+ 	if (ns->proc_thread_self)
+ 		dput(ns->proc_thread_self);
+ 	kill_anon_super(sb);
+ 	put_pid_ns(ns);
+ }
+ 
+ static struct file_system_type proc_fs_type = {
+ 	.name			= "proc",
+ 	.init_fs_context	= proc_init_fs_context,
+ 	.parameters		= &proc_fs_parameters,
+ 	.kill_sb		= proc_kill_sb,
+ 	.fs_flags		= FS_USERNS_MOUNT | FS_DISALLOW_NOTIFY_PERM,
+ };
+ 
+ void __init proc_root_init(void)
+ {
+ 	proc_init_kmemcache();
+ 	set_proc_pid_nlink();
+ 	proc_self_init();
+ 	proc_thread_self_init();
+ 	proc_symlink("mounts", NULL, "self/mounts");
+ 
+ 	proc_net_init();
+ 	proc_mkdir("fs", NULL);
+ 	proc_mkdir("driver", NULL);
+ 	proc_create_mount_point("fs/nfsd"); /* somewhere for the nfsd filesystem to be mounted */
+ #if defined(CONFIG_SUN_OPENPROMFS) || defined(CONFIG_SUN_OPENPROMFS_MODULE)
+ 	/* just give it a mountpoint */
+ 	proc_create_mount_point("openprom");
+ #endif
+ 	proc_tty_init();
+ #ifdef CONFIG_MINISEC_PROC_ADD
+ #ifdef CONFIG_MINISEC_PROC_USER
+ 	proc_mkdir_mode("bus", S_IRUSR | S_IXUSR, NULL);
+ #elif defined(CONFIG_MINISEC_PROC_USERGROUP)
+ 	proc_mkdir_mode("bus", S_IRUSR | S_IXUSR | S_IRGRP | S_IXGRP, NULL);
+ #endif
+ #else
+ 	proc_mkdir("bus", NULL);
+ #endif
+ 	proc_sys_init();
+ 
+ 	register_filesystem(&proc_fs_type);
+ }
+ 
+ static int proc_root_getattr(const struct path *path, struct kstat *stat,
+ 			     u32 request_mask, unsigned int query_flags)
+ {
+ 	generic_fillattr(d_inode(path->dentry), stat);
+ 	stat->nlink = proc_root.nlink + nr_processes();
+ 	return 0;
+ }
+ 
+ static struct dentry *proc_root_lookup(struct inode * dir, struct dentry * dentry, unsigned int flags)
+ {
+ 	if (!proc_pid_lookup(dentry, flags))
+ 		return NULL;
+ 
+ 	return proc_lookup(dir, dentry, flags);
+ }
+ 
+ static int proc_root_readdir(struct file *file, struct dir_context *ctx)
+ {
+ 	if (ctx->pos < FIRST_PROCESS_ENTRY) {
+ 		int error = proc_readdir(file, ctx);
+ 		if (unlikely(error <= 0))
+ 			return error;
+ 		ctx->pos = FIRST_PROCESS_ENTRY;
+ 	}
+ 
+ 	return proc_pid_readdir(file, ctx);
+ }
+ 
+ /*
+  * The root /proc directory is special, as it has the
+  * <pid> directories. Thus we don't use the generic
+  * directory handling functions for that..
+  */
+ static const struct file_operations proc_root_operations = {
+ 	.read		 = generic_read_dir,
+ 	.iterate_shared	 = proc_root_readdir,
+ 	.llseek		= generic_file_llseek,
+ };
+ 
+ /*
+  * proc root can do almost nothing..
+  */
+ static const struct inode_operations proc_root_inode_operations = {
+ 	.lookup		= proc_root_lookup,
+ 	.getattr	= proc_root_getattr,
+ };
+ 
+ /*
+  * This is the root "inode" in the /proc tree..
+  */
+ struct proc_dir_entry proc_root = {
+ 	.low_ino	= PROC_ROOT_INO, 
+ 	.namelen	= 5, 
+ 	.mode		= S_IFDIR | S_IRUGO | S_IXUGO, 
+ 	.nlink		= 2, 
+ 	.refcnt		= REFCOUNT_INIT(1),
+ 	.proc_iops	= &proc_root_inode_operations, 
+ 	.proc_fops	= &proc_root_operations,
+ 	.parent		= &proc_root,
+ 	.subdir		= RB_ROOT,
+ 	.name		= "/proc",
+ };
+ 
+ int pid_ns_prepare_proc(struct pid_namespace *ns)
+ {
+ 	struct proc_fs_context *ctx;
+ 	struct fs_context *fc;
+ 	struct vfsmount *mnt;
+ 
+ 	fc = fs_context_for_mount(&proc_fs_type, SB_KERNMOUNT);
+ 	if (IS_ERR(fc))
+ 		return PTR_ERR(fc);
+ 
+ 	if (fc->user_ns != ns->user_ns) {
+ 		put_user_ns(fc->user_ns);
+ 		fc->user_ns = get_user_ns(ns->user_ns);
+ 	}
+ 
+ 	ctx = fc->fs_private;
+ 	if (ctx->pid_ns != ns) {
+ 		put_pid_ns(ctx->pid_ns);
+ 		get_pid_ns(ns);
+ 		ctx->pid_ns = ns;
+ 	}
+ 
+ 	mnt = fc_mount(fc);
+ 	put_fs_context(fc);
+ 	if (IS_ERR(mnt))
+ 		return PTR_ERR(mnt);
+ 
+ 	ns->proc_mnt = mnt;
+ 	return 0;
+ }
+ 
+ void pid_ns_release_proc(struct pid_namespace *ns)
+ {
+ 	kern_unmount(ns->proc_mnt);
+ }
diff --color -rcNP Master/fs/proc/root.c.rej OG/fs/proc/root.c.rej
*** Master/fs/proc/root.c.rej	1969-12-31 19:00:00.000000000 -0500
--- OG/fs/proc/root.c.rej	2021-04-20 15:11:27.318000000 -0400
***************
*** 0 ****
--- 1,46 ----
+ *** fs/proc/root.c	2021-03-13 15:05:18.000000000 +0200
+ --- fs/proc/root.c	2021-03-11 15:06:51.000000000 +0200
+ ***************
+ *** 118,124 ****
+   	 * top of it
+   	 */
+   	s->s_stack_depth = FILESYSTEM_MAX_STACK_DEPTH;
+ ! 
+   	/* procfs dentries and inodes don't require IO to create */
+   	s->s_shrink.seeks = 0;
+   
+ --- 118,124 ----
+   	 * top of it
+   	 */
+   	s->s_stack_depth = FILESYSTEM_MAX_STACK_DEPTH;
+ ! 
+   	/* procfs dentries and inodes don't require IO to create */
+   	s->s_shrink.seeks = 0;
+   
+ ***************
+ *** 286,297 ****
+    * This is the root "inode" in the /proc tree..
+    */
+   struct proc_dir_entry proc_root = {
+ ! 	.low_ino	= PROC_ROOT_INO,
+ ! 	.namelen	= 5,
+ ! 	.mode		= S_IFDIR | S_IRUGO | S_IXUGO,
+ ! 	.nlink		= 2,
+   	.refcnt		= REFCOUNT_INIT(1),
+ ! 	.proc_iops	= &proc_root_inode_operations,
+   	.proc_fops	= &proc_root_operations,
+   	.parent		= &proc_root,
+   	.subdir		= RB_ROOT,
+ --- 278,289 ----
+    * This is the root "inode" in the /proc tree..
+    */
+   struct proc_dir_entry proc_root = {
+ ! 	.low_ino	= PROC_ROOT_INO,
+ ! 	.namelen	= 5,
+ ! 	.mode		= S_IFDIR | S_IRUGO | S_IXUGO,
+ ! 	.nlink		= 2,
+   	.refcnt		= REFCOUNT_INIT(1),
+ ! 	.proc_iops	= &proc_root_inode_operations,
+   	.proc_fops	= &proc_root_operations,
+   	.parent		= &proc_root,
+   	.subdir		= RB_ROOT,
diff --color -rcNP Master/fs/proc/stat.c OG/fs/proc/stat.c
*** Master/fs/proc/stat.c	2021-04-20 14:17:31.000000000 -0400
--- OG/fs/proc/stat.c	2021-04-20 15:11:34.512000000 -0400
***************
*** 13,18 ****
--- 13,19 ----
  #include <linux/irqnr.h>
  #include <linux/sched/cputime.h>
  #include <linux/tick.h>
+ #include <linux/minisec.h>
  
  #ifndef arch_irq_stat_cpu
  #define arch_irq_stat_cpu(cpu) 0
***************
*** 92,104 ****
  	}
  }
  
! static void show_all_irqs(struct seq_file *p)
  {
  	unsigned int i, next = 0;
  
  	for_each_active_irq(i) {
  		show_irq_gap(p, i - next);
! 		seq_put_decimal_ull(p, " ", kstat_irqs_usr(i));
  		next = i + 1;
  	}
  	show_irq_gap(p, nr_irqs - next);
--- 93,105 ----
  	}
  }
  
! static void show_all_irqs(struct seq_file *p, int unrestricted)
  {
  	unsigned int i, next = 0;
  
  	for_each_active_irq(i) {
  		show_irq_gap(p, i - next);
! 		seq_put_decimal_ull(p, " ", unrestricted ? kstat_irqs_usr(i) : 0ULL);
  		next = i + 1;
  	}
  	show_irq_gap(p, nr_irqs - next);
***************
*** 113,118 ****
--- 114,130 ----
  	u64 sum_softirq = 0;
  	unsigned int per_softirq_sums[NR_SOFTIRQS] = {0};
  	struct timespec64 boottime;
+ 	int unrestricted = 1;
+ #ifdef CONFIG_MINISEC_PROC_ADD
+ #if defined(CONFIG_MINISEC_PROC_USER) || defined(CONFIG_MINISEC_PROC_USERGROUP)
+ 	if (!uid_eq(current_uid(), GLOBAL_ROOT_UID)
+ #ifdef CONFIG_MINISEC_PROC_USERGROUP
+ 		&& !in_group_p(grsec_proc_gid)
+ #endif
+ 	)
+ 		unrestricted = 0;
+ #endif
+ #endif
  
  	user = nice = system = idle = iowait =
  		irq = softirq = steal = 0;
***************
*** 126,131 ****
--- 138,144 ----
  		nice += kcs->cpustat[CPUTIME_NICE];
  		system += kcs->cpustat[CPUTIME_SYSTEM];
  		idle += get_idle_time(kcs, i);
+ if (unrestricted) {
  		iowait += get_iowait_time(kcs, i);
  		irq += kcs->cpustat[CPUTIME_IRQ];
  		softirq += kcs->cpustat[CPUTIME_SOFTIRQ];
***************
*** 141,148 ****
  			per_softirq_sums[j] += softirq_stat;
  			sum_softirq += softirq_stat;
  		}
  	}
! 	sum += arch_irq_stat();
  
  	seq_put_decimal_ull(p, "cpu  ", nsec_to_clock_t(user));
  	seq_put_decimal_ull(p, " ", nsec_to_clock_t(nice));
--- 154,164 ----
  			per_softirq_sums[j] += softirq_stat;
  			sum_softirq += softirq_stat;
  		}
+ }
  	}
! 
! 	if (unrestricted)
! 		sum += arch_irq_stat();
  
  	seq_put_decimal_ull(p, "cpu  ", nsec_to_clock_t(user));
  	seq_put_decimal_ull(p, " ", nsec_to_clock_t(nice));
***************
*** 164,175 ****
--- 180,193 ----
  		nice = kcs->cpustat[CPUTIME_NICE];
  		system = kcs->cpustat[CPUTIME_SYSTEM];
  		idle = get_idle_time(kcs, i);
+ if (unrestricted) {
  		iowait = get_iowait_time(kcs, i);
  		irq = kcs->cpustat[CPUTIME_IRQ];
  		softirq = kcs->cpustat[CPUTIME_SOFTIRQ];
  		steal = kcs->cpustat[CPUTIME_STEAL];
  		guest = kcs->cpustat[CPUTIME_GUEST];
  		guest_nice = kcs->cpustat[CPUTIME_GUEST_NICE];
+ }
  		seq_printf(p, "cpu%d", i);
  		seq_put_decimal_ull(p, " ", nsec_to_clock_t(user));
  		seq_put_decimal_ull(p, " ", nsec_to_clock_t(nice));
***************
*** 185,203 ****
  	}
  	seq_put_decimal_ull(p, "intr ", (unsigned long long)sum);
  
! 	show_all_irqs(p);
  
  	seq_printf(p,
  		"\nctxt %llu\n"
  		"btime %llu\n"
! 		"processes %lu\n"
! 		"procs_running %lu\n"
! 		"procs_blocked %lu\n",
! 		nr_context_switches(),
  		(unsigned long long)boottime.tv_sec,
! 		total_forks,
! 		nr_running(),
! 		nr_iowait());
  
  	seq_put_decimal_ull(p, "softirq ", (unsigned long long)sum_softirq);
  
--- 203,221 ----
  	}
  	seq_put_decimal_ull(p, "intr ", (unsigned long long)sum);
  
! 	show_all_irqs(p, unrestricted);
  
  	seq_printf(p,
  		"\nctxt %llu\n"
  		"btime %llu\n"
! 		"processes %llu\n"
! 		"procs_running %llu\n"
! 		"procs_blocked %llu\n",
! 		unrestricted ? nr_context_switches() : 0ULL,
  		(unsigned long long)boottime.tv_sec,
! 		unrestricted ? total_forks : 0ULL,
! 		unrestricted ? nr_running() : 0ULL,
! 		unrestricted ? nr_iowait() : 0ULL);
  
  	seq_put_decimal_ull(p, "softirq ", (unsigned long long)sum_softirq);
  
diff --color -rcNP Master/fs/sysfs/dir.c OG/fs/sysfs/dir.c
*** Master/fs/sysfs/dir.c	2021-04-20 14:17:31.000000000 -0400
--- OG/fs/sysfs/dir.c	2021-04-20 15:11:34.513000000 -0400
***************
*** 32,37 ****
--- 32,41 ----
  	kfree(buf);
  }
  
+ #ifdef CONFIG_MINISEC_SYSFS_RESTRICT
+ extern int grsec_enable_sysfs_restrict;
+ #endif
+ 
  /**
   * sysfs_create_dir_ns - create a directory for an object with a namespace tag
   * @kobj: object we're creating directory for
***************
*** 42,51 ****
--- 46,62 ----
  	struct kernfs_node *parent, *kn;
  	kuid_t uid;
  	kgid_t gid;
+ 	const char *name;
+ 	umode_t mode = S_IRWXU | S_IRUGO | S_IXUGO;
+ #ifdef CONFIG_MINISEC_SYSFS_RESTRICT
+ 	const char *parent_name;
+ #endif
  
  	if (WARN_ON(!kobj))
  		return -EINVAL;
  
+ 	name = kobject_name(kobj);
+ 
  	if (kobj->parent)
  		parent = kobj->parent->sd;
  	else
***************
*** 56,67 ****
  
  	kobject_get_ownership(kobj, &uid, &gid);
  
  	kn = kernfs_create_dir_ns(parent, kobject_name(kobj),
! 				  S_IRWXU | S_IRUGO | S_IXUGO, uid, gid,
  				  kobj, ns);
  	if (IS_ERR(kn)) {
  		if (PTR_ERR(kn) == -EEXIST)
! 			sysfs_warn_dup(parent, kobject_name(kobj));
  		return PTR_ERR(kn);
  	}
  
--- 67,92 ----
  
  	kobject_get_ownership(kobj, &uid, &gid);
  
+ #ifdef CONFIG_MINISEC_SYSFS_RESTRICT
+ 	parent_name = parent->name;
+ 	mode = S_IRWXU;
+ 
+ 	if ((!strcmp(parent_name, "") && (!strcmp(name, "devices") || !strcmp(name, "fs"))) ||
+ 	    (!strcmp(parent_name, "devices") && !strcmp(name, "system")) ||
+ 	    (!strcmp(parent_name, "fs") && (!strcmp(name, "selinux") || !strcmp(name, "fuse") || !strcmp(name, "ecryptfs"))) ||
+ 	    (!strcmp(parent_name, "system") && !strcmp(name, "cpu")))
+ 		mode = S_IRWXU | S_IRUGO | S_IXUGO;
+ 	if (!grsec_enable_sysfs_restrict)
+ 		mode = S_IRWXU | S_IRUGO | S_IXUGO;
+ #endif
+ 
  	kn = kernfs_create_dir_ns(parent, kobject_name(kobj),
! 					mode, uid, gid,
  				  kobj, ns);
+ 
  	if (IS_ERR(kn)) {
  		if (PTR_ERR(kn) == -EEXIST)
! 			sysfs_warn_dup(parent, name);
  		return PTR_ERR(kn);
  	}
  
diff --color -rcNP Master/fs/xattr.c OG/fs/xattr.c
*** Master/fs/xattr.c	2021-04-20 14:17:31.000000000 -0400
--- OG/fs/xattr.c	2021-04-20 15:11:34.513000000 -0400
***************
*** 349,354 ****
--- 349,370 ----
  }
  EXPORT_SYMBOL(__vfs_getxattr);
  
+ #ifdef CONFIG_MINISEC_XATTR_PAX_FLAGS
+ ssize_t
+ pax_getxattr(struct dentry *dentry, void *value, size_t size)
+ {
+ 	struct inode *inode = dentry->d_inode;	/* NOTE(review): assumes a positive dentry -- confirm callers */
+ 	ssize_t error;
+ 	/* Read the PaX flags xattr of dentry's inode; the MAY_EXEC permission check must pass first. */
+ 	error = inode_permission(inode, MAY_EXEC);
+ 	if (error)
+ 		return error;
+ 	/* Result of reading XATTR_NAME_USER_PAX_FLAGS through __vfs_getxattr() is returned unchanged. */
+ 	return __vfs_getxattr(dentry, inode, XATTR_NAME_USER_PAX_FLAGS, value, size);
+ }
+ EXPORT_SYMBOL(pax_getxattr);
+ #endif /* CONFIG_MINISEC_XATTR_PAX_FLAGS */
+ 
  ssize_t
  vfs_getxattr(struct dentry *dentry, const char *name, void *value, size_t size)
  {
diff --color -rcNP Master/include/linux/binfmts.h OG/include/linux/binfmts.h
*** Master/include/linux/binfmts.h	2021-04-20 14:17:31.000000000 -0400
--- OG/include/linux/binfmts.h	2021-04-20 15:11:34.513000000 -0400
***************
*** 105,110 ****
--- 105,113 ----
  	int (*load_binary)(struct linux_binprm *);
  	int (*load_shlib)(struct file *);
  	int (*core_dump)(struct coredump_params *cprm);
+ #ifdef CONFIG_MINISEC_MPROTECT
+ 	void (*handle_mprotect)(struct vm_area_struct *vma, unsigned long newflags);
+ #endif
  	unsigned long min_coredump;	/* minimal dump size */
  } __randomize_layout;
  
diff --color -rcNP Master/include/linux/dcache.h OG/include/linux/dcache.h
*** Master/include/linux/dcache.h	2021-04-20 14:17:31.000000000 -0400
--- OG/include/linux/dcache.h	2021-04-20 15:11:34.513000000 -0400
***************
*** 108,113 ****
--- 108,116 ----
  		struct list_head d_lru;		/* LRU list */
  		wait_queue_head_t *d_wait;	/* in-lookup ones only */
  	};
+ #ifdef CONFIG_MINISEC_CHROOT_RENAME
+ 	atomic_t chroot_refcnt;		/* tracks use of directory in chroot */
+ #endif
  	struct list_head d_child;	/* child of parent list */
  	struct list_head d_subdirs;	/* our children */
  	/*
***************
*** 266,272 ****
   * This adds the entry to the hash queues.
   */
  extern void d_rehash(struct dentry *);
!  
  extern void d_add(struct dentry *, struct inode *);
  
  /* used for rename() and baskets */
--- 269,275 ----
   * This adds the entry to the hash queues.
   */
  extern void d_rehash(struct dentry *);
! 
  extern void d_add(struct dentry *, struct inode *);
  
  /* used for rename() and baskets */
***************
*** 305,311 ****
   *	@dentry: dentry to get a reference to
   *
   *	Given a dentry or %NULL pointer increment the reference count
!  *	if appropriate and return the dentry. A dentry will not be 
   *	destroyed when it has references.
   */
  static inline struct dentry *dget_dlock(struct dentry *dentry)
--- 308,314 ----
   *	@dentry: dentry to get a reference to
   *
   *	Given a dentry or %NULL pointer increment the reference count
!  *	if appropriate and return the dentry. A dentry will not be
   *	destroyed when it has references.
   */
  static inline struct dentry *dget_dlock(struct dentry *dentry)
***************
*** 330,336 ****
   *
   *	Returns true if the dentry passed is not currently hashed.
   */
!  
  static inline int d_unhashed(const struct dentry *dentry)
  {
  	return hlist_bl_unhashed(&dentry->d_hash);
--- 333,339 ----
   *
   *	Returns true if the dentry passed is not currently hashed.
   */
! 
  static inline int d_unhashed(const struct dentry *dentry)
  {
  	return hlist_bl_unhashed(&dentry->d_hash);
diff --color -rcNP Master/include/linux/dcache.h.orig OG/include/linux/dcache.h.orig
*** Master/include/linux/dcache.h.orig	1969-12-31 19:00:00.000000000 -0500
--- OG/include/linux/dcache.h.orig	2021-04-20 15:10:45.384000000 -0400
***************
*** 0 ****
--- 1,604 ----
+ /* SPDX-License-Identifier: GPL-2.0 */
+ #ifndef __LINUX_DCACHE_H
+ #define __LINUX_DCACHE_H
+ 
+ #include <linux/atomic.h>
+ #include <linux/list.h>
+ #include <linux/rculist.h>
+ #include <linux/rculist_bl.h>
+ #include <linux/spinlock.h>
+ #include <linux/seqlock.h>
+ #include <linux/cache.h>
+ #include <linux/rcupdate.h>
+ #include <linux/lockref.h>
+ #include <linux/stringhash.h>
+ #include <linux/wait.h>
+ 
+ struct path;
+ struct vfsmount;
+ 
+ /*
+  * linux/include/linux/dcache.h
+  *
+  * Dirent cache data structures
+  *
+  * (C) Copyright 1997 Thomas Schoebel-Theuer,
+  * with heavy changes by Linus Torvalds
+  */
+ 
+ #define IS_ROOT(x) ((x) == (x)->d_parent)
+ 
+ /* The hash is always the low bits of hash_len */
+ #ifdef __LITTLE_ENDIAN
+  #define HASH_LEN_DECLARE u32 hash; u32 len
+  #define bytemask_from_count(cnt)	(~(~0ul << (cnt)*8))
+ #else
+  #define HASH_LEN_DECLARE u32 len; u32 hash
+  #define bytemask_from_count(cnt)	(~(~0ul >> (cnt)*8))
+ #endif
+ 
+ /*
+  * "quick string" -- eases parameter passing, but more importantly
+  * saves "metadata" about the string (ie length and the hash).
+  *
+  * hash comes first so it snuggles against d_parent in the
+  * dentry.
+  */
+ struct qstr {
+ 	union {
+ 		struct {
+ 			HASH_LEN_DECLARE;
+ 		};
+ 		u64 hash_len;
+ 	};
+ 	const unsigned char *name;
+ };
+ 
+ #define QSTR_INIT(n,l) { { { .len = l } }, .name = n }
+ 
+ extern const struct qstr empty_name;
+ extern const struct qstr slash_name;
+ 
+ struct dentry_stat_t {
+ 	long nr_dentry;
+ 	long nr_unused;
+ 	long age_limit;		/* age in seconds */
+ 	long want_pages;	/* pages requested by system */
+ 	long nr_negative;	/* # of unused negative dentries */
+ 	long dummy;		/* Reserved for future use */
+ };
+ extern struct dentry_stat_t dentry_stat;
+ 
+ /*
+  * Try to keep struct dentry aligned on 64 byte cachelines (this will
+  * give reasonable cacheline footprint with larger lines without the
+  * large memory footprint increase).
+  */
+ #ifdef CONFIG_64BIT
+ # define DNAME_INLINE_LEN 32 /* 192 bytes */
+ #else
+ # ifdef CONFIG_SMP
+ #  define DNAME_INLINE_LEN 36 /* 128 bytes */
+ # else
+ #  define DNAME_INLINE_LEN 40 /* 128 bytes */
+ # endif
+ #endif
+ 
+ #define d_lock	d_lockref.lock
+ 
+ struct dentry {
+ 	/* RCU lookup touched fields */
+ 	unsigned int d_flags;		/* protected by d_lock */
+ 	seqcount_t d_seq;		/* per dentry seqlock */
+ 	struct hlist_bl_node d_hash;	/* lookup hash list */
+ 	struct dentry *d_parent;	/* parent directory */
+ 	struct qstr d_name;
+ 	struct inode *d_inode;		/* Where the name belongs to - NULL is
+ 					 * negative */
+ 	unsigned char d_iname[DNAME_INLINE_LEN];	/* small names */
+ 
+ 	/* Ref lookup also touches following */
+ 	struct lockref d_lockref;	/* per-dentry lock and refcount */
+ 	const struct dentry_operations *d_op;
+ 	struct super_block *d_sb;	/* The root of the dentry tree */
+ 	unsigned long d_time;		/* used by d_revalidate */
+ 	void *d_fsdata;			/* fs-specific data */
+ 
+ 	union {
+ 		struct list_head d_lru;		/* LRU list */
+ 		wait_queue_head_t *d_wait;	/* in-lookup ones only */
+ 	};
+ #ifdef CONFIG_MINISEC_CHROOT_RENAME
+ 	atomic_t chroot_refcnt;		/* tracks use of directory in chroot */
+ #endif
+ 	struct list_head d_child;	/* child of parent list */
+ 	struct list_head d_subdirs;	/* our children */
+ 	/*
+ 	 * d_alias and d_rcu can share memory
+ 	 */
+ 	union {
+ 		struct hlist_node d_alias;	/* inode alias list */
+ 		struct hlist_bl_node d_in_lookup_hash;	/* only for in-lookup ones */
+ 	 	struct rcu_head d_rcu;
+ 	} d_u;
+ } __randomize_layout;
+ 
+ /*
+  * dentry->d_lock spinlock nesting subclasses:
+  *
+  * 0: normal
+  * 1: nested
+  */
+ enum dentry_d_lock_class
+ {
+ 	DENTRY_D_LOCK_NORMAL, /* implicitly used by plain spin_lock() APIs. */
+ 	DENTRY_D_LOCK_NESTED
+ };
+ 
+ struct dentry_operations {
+ 	int (*d_revalidate)(struct dentry *, unsigned int);
+ 	int (*d_weak_revalidate)(struct dentry *, unsigned int);
+ 	int (*d_hash)(const struct dentry *, struct qstr *);
+ 	int (*d_compare)(const struct dentry *,
+ 			unsigned int, const char *, const struct qstr *);
+ 	int (*d_delete)(const struct dentry *);
+ 	int (*d_init)(struct dentry *);
+ 	void (*d_release)(struct dentry *);
+ 	void (*d_prune)(struct dentry *);
+ 	void (*d_iput)(struct dentry *, struct inode *);
+ 	char *(*d_dname)(struct dentry *, char *, int);
+ 	struct vfsmount *(*d_automount)(struct path *);
+ 	int (*d_manage)(const struct path *, bool);
+ 	struct dentry *(*d_real)(struct dentry *, const struct inode *);
+ } ____cacheline_aligned;
+ 
+ /*
+  * Locking rules for dentry_operations callbacks are to be found in
+  * Documentation/filesystems/locking.rst. Keep it updated!
+  *
+  * FUrther descriptions are found in Documentation/filesystems/vfs.rst.
+  * Keep it updated too!
+  */
+ 
+ /* d_flags entries */
+ #define DCACHE_OP_HASH			0x00000001
+ #define DCACHE_OP_COMPARE		0x00000002
+ #define DCACHE_OP_REVALIDATE		0x00000004
+ #define DCACHE_OP_DELETE		0x00000008
+ #define DCACHE_OP_PRUNE			0x00000010
+ 
+ #define	DCACHE_DISCONNECTED		0x00000020
+      /* This dentry is possibly not currently connected to the dcache tree, in
+       * which case its parent will either be itself, or will have this flag as
+       * well.  nfsd will not use a dentry with this bit set, but will first
+       * endeavour to clear the bit either by discovering that it is connected,
+       * or by performing lookup operations.   Any filesystem which supports
+       * nfsd_operations MUST have a lookup function which, if it finds a
+       * directory inode with a DCACHE_DISCONNECTED dentry, will d_move that
+       * dentry into place and return that dentry rather than the passed one,
+       * typically using d_splice_alias. */
+ 
+ #define DCACHE_REFERENCED		0x00000040 /* Recently used, don't discard. */
+ 
+ #define DCACHE_CANT_MOUNT		0x00000100
+ #define DCACHE_GENOCIDE			0x00000200
+ #define DCACHE_SHRINK_LIST		0x00000400
+ 
+ #define DCACHE_OP_WEAK_REVALIDATE	0x00000800
+ 
+ #define DCACHE_NFSFS_RENAMED		0x00001000
+      /* this dentry has been "silly renamed" and has to be deleted on the last
+       * dput() */
+ #define DCACHE_COOKIE			0x00002000 /* For use by dcookie subsystem */
+ #define DCACHE_FSNOTIFY_PARENT_WATCHED	0x00004000
+      /* Parent inode is watched by some fsnotify listener */
+ 
+ #define DCACHE_DENTRY_KILLED		0x00008000
+ 
+ #define DCACHE_MOUNTED			0x00010000 /* is a mountpoint */
+ #define DCACHE_NEED_AUTOMOUNT		0x00020000 /* handle automount on this dir */
+ #define DCACHE_MANAGE_TRANSIT		0x00040000 /* manage transit from this dirent */
+ #define DCACHE_MANAGED_DENTRY \
+ 	(DCACHE_MOUNTED|DCACHE_NEED_AUTOMOUNT|DCACHE_MANAGE_TRANSIT)
+ 
+ #define DCACHE_LRU_LIST			0x00080000
+ 
+ #define DCACHE_ENTRY_TYPE		0x00700000
+ #define DCACHE_MISS_TYPE		0x00000000 /* Negative dentry (maybe fallthru to nowhere) */
+ #define DCACHE_WHITEOUT_TYPE		0x00100000 /* Whiteout dentry (stop pathwalk) */
+ #define DCACHE_DIRECTORY_TYPE		0x00200000 /* Normal directory */
+ #define DCACHE_AUTODIR_TYPE		0x00300000 /* Lookupless directory (presumed automount) */
+ #define DCACHE_REGULAR_TYPE		0x00400000 /* Regular file type (or fallthru to such) */
+ #define DCACHE_SPECIAL_TYPE		0x00500000 /* Other file type (or fallthru to such) */
+ #define DCACHE_SYMLINK_TYPE		0x00600000 /* Symlink (or fallthru to such) */
+ 
+ #define DCACHE_MAY_FREE			0x00800000
+ #define DCACHE_FALLTHRU			0x01000000 /* Fall through to lower layer */
+ #define DCACHE_ENCRYPTED_NAME		0x02000000 /* Encrypted name (dir key was unavailable) */
+ #define DCACHE_OP_REAL			0x04000000
+ 
+ #define DCACHE_PAR_LOOKUP		0x10000000 /* being looked up (with parent locked shared) */
+ #define DCACHE_DENTRY_CURSOR		0x20000000
+ #define DCACHE_NORCU			0x40000000 /* No RCU delay for freeing */
+ 
+ extern seqlock_t rename_lock;
+ 
+ /*
+  * These are the low-level FS interfaces to the dcache..
+  */
+ extern void d_instantiate(struct dentry *, struct inode *);
+ extern void d_instantiate_new(struct dentry *, struct inode *);
+ extern struct dentry * d_instantiate_unique(struct dentry *, struct inode *);
+ extern struct dentry * d_instantiate_anon(struct dentry *, struct inode *);
+ extern void __d_drop(struct dentry *dentry);
+ extern void d_drop(struct dentry *dentry);
+ extern void d_delete(struct dentry *);
+ extern void d_set_d_op(struct dentry *dentry, const struct dentry_operations *op);
+ 
+ /* allocate/de-allocate */
+ extern struct dentry * d_alloc(struct dentry *, const struct qstr *);
+ extern struct dentry * d_alloc_anon(struct super_block *);
+ extern struct dentry * d_alloc_parallel(struct dentry *, const struct qstr *,
+ 					wait_queue_head_t *);
+ extern struct dentry * d_splice_alias(struct inode *, struct dentry *);
+ extern struct dentry * d_add_ci(struct dentry *, struct inode *, struct qstr *);
+ extern struct dentry * d_exact_alias(struct dentry *, struct inode *);
+ extern struct dentry *d_find_any_alias(struct inode *inode);
+ extern struct dentry * d_obtain_alias(struct inode *);
+ extern struct dentry * d_obtain_root(struct inode *);
+ extern void shrink_dcache_sb(struct super_block *);
+ extern void shrink_dcache_parent(struct dentry *);
+ extern void shrink_dcache_for_umount(struct super_block *);
+ extern void d_invalidate(struct dentry *);
+ 
+ /* only used at mount-time */
+ extern struct dentry * d_make_root(struct inode *);
+ 
+ /* <clickety>-<click> the ramfs-type tree */
+ extern void d_genocide(struct dentry *);
+ 
+ extern void d_tmpfile(struct dentry *, struct inode *);
+ 
+ extern struct dentry *d_find_alias(struct inode *);
+ extern void d_prune_aliases(struct inode *);
+ 
+ /* test whether we have any submounts in a subdir tree */
+ extern int path_has_submounts(const struct path *);
+ 
+ /*
+  * This adds the entry to the hash queues.
+  */
+ extern void d_rehash(struct dentry *);
+  
+ extern void d_add(struct dentry *, struct inode *);
+ 
+ /* used for rename() and baskets */
+ extern void d_move(struct dentry *, struct dentry *);
+ extern void d_exchange(struct dentry *, struct dentry *);
+ extern struct dentry *d_ancestor(struct dentry *, struct dentry *);
+ 
+ /* appendix may either be NULL or be used for transname suffixes */
+ extern struct dentry *d_lookup(const struct dentry *, const struct qstr *);
+ extern struct dentry *d_hash_and_lookup(struct dentry *, struct qstr *);
+ extern struct dentry *__d_lookup(const struct dentry *, const struct qstr *);
+ extern struct dentry *__d_lookup_rcu(const struct dentry *parent,
+ 				const struct qstr *name, unsigned *seq);
+ 
+ static inline unsigned d_count(const struct dentry *dentry)
+ {
+ 	return dentry->d_lockref.count;
+ }
+ 
+ /*
+  * helper function for dentry_operations.d_dname() members
+  */
+ extern __printf(4, 5)
+ char *dynamic_dname(struct dentry *, char *, int, const char *, ...);
+ 
+ extern char *__d_path(const struct path *, const struct path *, char *, int);
+ extern char *d_absolute_path(const struct path *, char *, int);
+ extern char *d_path(const struct path *, char *, int);
+ extern char *dentry_path_raw(struct dentry *, char *, int);
+ extern char *dentry_path(struct dentry *, char *, int);
+ 
+ /* Allocation counts.. */
+ 
+ /**
+  *	dget, dget_dlock -	get a reference to a dentry
+  *	@dentry: dentry to get a reference to
+  *
+  *	Given a dentry or %NULL pointer increment the reference count
+  *	if appropriate and return the dentry. A dentry will not be 
+  *	destroyed when it has references.
+  */
+ static inline struct dentry *dget_dlock(struct dentry *dentry)
+ {
+ 	if (dentry)
+ 		dentry->d_lockref.count++;
+ 	return dentry;
+ }
+ 
+ static inline struct dentry *dget(struct dentry *dentry)
+ {
+ 	if (dentry)
+ 		lockref_get(&dentry->d_lockref);
+ 	return dentry;
+ }
+ 
+ extern struct dentry *dget_parent(struct dentry *dentry);
+ 
+ /**
+  *	d_unhashed -	is dentry hashed
+  *	@dentry: entry to check
+  *
+  *	Returns true if the dentry passed is not currently hashed.
+  */
+  
+ static inline int d_unhashed(const struct dentry *dentry)
+ {
+ 	return hlist_bl_unhashed(&dentry->d_hash);
+ }
+ 
+ static inline int d_unlinked(const struct dentry *dentry)
+ {
+ 	return d_unhashed(dentry) && !IS_ROOT(dentry);
+ }
+ 
+ static inline int cant_mount(const struct dentry *dentry)
+ {
+ 	return (dentry->d_flags & DCACHE_CANT_MOUNT);
+ }
+ 
+ static inline void dont_mount(struct dentry *dentry)
+ {
+ 	spin_lock(&dentry->d_lock);
+ 	dentry->d_flags |= DCACHE_CANT_MOUNT;
+ 	spin_unlock(&dentry->d_lock);
+ }
+ 
+ extern void __d_lookup_done(struct dentry *);
+ 
+ static inline int d_in_lookup(const struct dentry *dentry)
+ {
+ 	return dentry->d_flags & DCACHE_PAR_LOOKUP;
+ }
+ 
+ static inline void d_lookup_done(struct dentry *dentry)
+ {
+ 	if (unlikely(d_in_lookup(dentry))) {
+ 		spin_lock(&dentry->d_lock);
+ 		__d_lookup_done(dentry);
+ 		spin_unlock(&dentry->d_lock);
+ 	}
+ }
+ 
+ extern void dput(struct dentry *);
+ 
+ static inline bool d_managed(const struct dentry *dentry)
+ {
+ 	return dentry->d_flags & DCACHE_MANAGED_DENTRY;
+ }
+ 
+ static inline bool d_mountpoint(const struct dentry *dentry)
+ {
+ 	return dentry->d_flags & DCACHE_MOUNTED;
+ }
+ 
+ /*
+  * Directory cache entry type accessor functions.
+  */
+ static inline unsigned __d_entry_type(const struct dentry *dentry)
+ {
+ 	return dentry->d_flags & DCACHE_ENTRY_TYPE;
+ }
+ 
+ static inline bool d_is_miss(const struct dentry *dentry)
+ {
+ 	return __d_entry_type(dentry) == DCACHE_MISS_TYPE;
+ }
+ 
+ static inline bool d_is_whiteout(const struct dentry *dentry)
+ {
+ 	return __d_entry_type(dentry) == DCACHE_WHITEOUT_TYPE;
+ }
+ 
+ static inline bool d_can_lookup(const struct dentry *dentry)
+ {
+ 	return __d_entry_type(dentry) == DCACHE_DIRECTORY_TYPE;
+ }
+ 
+ static inline bool d_is_autodir(const struct dentry *dentry)
+ {
+ 	return __d_entry_type(dentry) == DCACHE_AUTODIR_TYPE;
+ }
+ 
+ static inline bool d_is_dir(const struct dentry *dentry)
+ {
+ 	return d_can_lookup(dentry) || d_is_autodir(dentry);
+ }
+ 
+ static inline bool d_is_symlink(const struct dentry *dentry)
+ {
+ 	return __d_entry_type(dentry) == DCACHE_SYMLINK_TYPE;
+ }
+ 
+ static inline bool d_is_reg(const struct dentry *dentry)
+ {
+ 	return __d_entry_type(dentry) == DCACHE_REGULAR_TYPE;
+ }
+ 
+ static inline bool d_is_special(const struct dentry *dentry)
+ {
+ 	return __d_entry_type(dentry) == DCACHE_SPECIAL_TYPE;
+ }
+ 
+ static inline bool d_is_file(const struct dentry *dentry)
+ {
+ 	return d_is_reg(dentry) || d_is_special(dentry);
+ }
+ 
+ static inline bool d_is_negative(const struct dentry *dentry)
+ {
+ 	// TODO: check d_is_whiteout(dentry) also.
+ 	return d_is_miss(dentry);
+ }
+ 
+ static inline bool d_is_positive(const struct dentry *dentry)
+ {
+ 	return !d_is_negative(dentry);
+ }
+ 
+ /**
+  * d_really_is_negative - Determine if a dentry is really negative (ignoring fallthroughs)
+  * @dentry: The dentry in question
+  *
+  * Returns true if the dentry represents either an absent name or a name that
+  * doesn't map to an inode (ie. ->d_inode is NULL).  The dentry could represent
+  * a true miss, a whiteout that isn't represented by a 0,0 chardev or a
+  * fallthrough marker in an opaque directory.
+  *
+  * Note!  (1) This should be used *only* by a filesystem to examine its own
+  * dentries.  It should not be used to look at some other filesystem's
+  * dentries.  (2) It should also be used in combination with d_inode() to get
+  * the inode.  (3) The dentry may have something attached to ->d_lower and the
+  * type field of the flags may be set to something other than miss or whiteout.
+  */
+ static inline bool d_really_is_negative(const struct dentry *dentry)
+ {
+ 	return dentry->d_inode == NULL;
+ }
+ 
+ /**
+  * d_really_is_positive - Determine if a dentry is really positive (ignoring fallthroughs)
+  * @dentry: The dentry in question
+  *
+  * Returns true if the dentry represents a name that maps to an inode
+  * (ie. ->d_inode is not NULL).  The dentry might still represent a whiteout if
+  * that is represented on medium as a 0,0 chardev.
+  *
+  * Note!  (1) This should be used *only* by a filesystem to examine its own
+  * dentries.  It should not be used to look at some other filesystem's
+  * dentries.  (2) It should also be used in combination with d_inode() to get
+  * the inode.
+  */
+ static inline bool d_really_is_positive(const struct dentry *dentry)
+ {
+ 	return dentry->d_inode != NULL;
+ }
+ 
+ static inline int simple_positive(const struct dentry *dentry)
+ {
+ 	return d_really_is_positive(dentry) && !d_unhashed(dentry);
+ }
+ 
+ extern void d_set_fallthru(struct dentry *dentry);
+ 
+ static inline bool d_is_fallthru(const struct dentry *dentry)
+ {
+ 	return dentry->d_flags & DCACHE_FALLTHRU;
+ }
+ 
+ 
+ extern int sysctl_vfs_cache_pressure;
+ 
+ static inline unsigned long vfs_pressure_ratio(unsigned long val)
+ {
+ 	return mult_frac(val, sysctl_vfs_cache_pressure, 100);
+ }
+ 
+ /**
+  * d_inode - Get the actual inode of this dentry
+  * @dentry: The dentry to query
+  *
+  * This is the helper normal filesystems should use to get at their own inodes
+  * in their own dentries and ignore the layering superimposed upon them.
+  */
+ static inline struct inode *d_inode(const struct dentry *dentry)
+ {
+ 	return dentry->d_inode;
+ }
+ 
+ /**
+  * d_inode_rcu - Get the actual inode of this dentry with READ_ONCE()
+  * @dentry: The dentry to query
+  *
+  * This is the helper normal filesystems should use to get at their own inodes
+  * in their own dentries and ignore the layering superimposed upon them.
+  */
+ static inline struct inode *d_inode_rcu(const struct dentry *dentry)
+ {
+ 	return READ_ONCE(dentry->d_inode);
+ }
+ 
+ /**
+  * d_backing_inode - Get upper or lower inode we should be using
+  * @upper: The upper layer
+  *
+  * This is the helper that should be used to get at the inode that will be used
+  * if this dentry were to be opened as a file.  The inode may be on the upper
+  * dentry or it may be on a lower dentry pinned by the upper.
+  *
+  * Normal filesystems should not use this to access their own inodes.
+  */
+ static inline struct inode *d_backing_inode(const struct dentry *upper)
+ {
+ 	struct inode *inode = upper->d_inode;
+ 
+ 	return inode;
+ }
+ 
+ /**
+  * d_backing_dentry - Get upper or lower dentry we should be using
+  * @upper: The upper layer
+  *
+  * This is the helper that should be used to get the dentry of the inode that
+  * will be used if this dentry were opened as a file.  It may be the upper
+  * dentry or it may be a lower dentry pinned by the upper.
+  *
+  * Normal filesystems should not use this to access their own dentries.
+  */
+ static inline struct dentry *d_backing_dentry(struct dentry *upper)
+ {
+ 	return upper;
+ }
+ 
+ /**
+  * d_real - Return the real dentry
+  * @dentry: the dentry to query
+  * @inode: inode to select the dentry from multiple layers (can be NULL)
+  *
+  * If dentry is on a union/overlay, then return the underlying, real dentry.
+  * Otherwise return the dentry itself.
+  *
+  * See also: Documentation/filesystems/vfs.rst
+  */
+ static inline struct dentry *d_real(struct dentry *dentry,
+ 				    const struct inode *inode)
+ {
+ 	if (unlikely(dentry->d_flags & DCACHE_OP_REAL))
+ 		return dentry->d_op->d_real(dentry, inode);
+ 	else
+ 		return dentry;
+ }
+ 
+ /**
+  * d_real_inode - Return the real inode
+  * @dentry: The dentry to query
+  *
+  * If dentry is on a union/overlay, then return the underlying, real inode.
+  * Otherwise return d_inode().
+  */
+ static inline struct inode *d_real_inode(const struct dentry *dentry)
+ {
+ 	/* This usage of d_real() results in const dentry */
+ 	return d_backing_inode(d_real((struct dentry *) dentry, NULL));
+ }
+ 
+ struct name_snapshot {
+ 	struct qstr name;
+ 	unsigned char inline_name[DNAME_INLINE_LEN];
+ };
+ void take_dentry_name_snapshot(struct name_snapshot *, struct dentry *);
+ void release_dentry_name_snapshot(struct name_snapshot *);
+ 
+ #endif	/* __LINUX_DCACHE_H */
diff --color -rcNP Master/include/linux/dcache.h.rej OG/include/linux/dcache.h.rej
*** Master/include/linux/dcache.h.rej	1969-12-31 19:00:00.000000000 -0500
--- OG/include/linux/dcache.h.rej	2021-04-20 15:11:27.320000000 -0400
***************
*** 0 ****
--- 1,53 ----
+ *** include/linux/dcache.h	2021-03-13 15:20:07.000000000 +0200
+ --- include/linux/dcache.h	2021-03-11 15:06:51.000000000 +0200
+ ***************
+ *** 266,272 ****
+    * This adds the entry to the hash queues.
+    */
+   extern void d_rehash(struct dentry *);
+ ! 
+   extern void d_add(struct dentry *, struct inode *);
+   
+   /* used for rename() and baskets */
+ --- 263,269 ----
+    * This adds the entry to the hash queues.
+    */
+   extern void d_rehash(struct dentry *);
+ ! 
+   extern void d_add(struct dentry *, struct inode *);
+   
+   /* used for rename() and baskets */
+ ***************
+ *** 305,311 ****
+    *	@dentry: dentry to get a reference to
+    *
+    *	Given a dentry or %NULL pointer increment the reference count
+ !  *	if appropriate and return the dentry. A dentry will not be
+    *	destroyed when it has references.
+    */
+   static inline struct dentry *dget_dlock(struct dentry *dentry)
+ --- 302,308 ----
+    *	@dentry: dentry to get a reference to
+    *
+    *	Given a dentry or %NULL pointer increment the reference count
+ !  *	if appropriate and return the dentry. A dentry will not be
+    *	destroyed when it has references.
+    */
+   static inline struct dentry *dget_dlock(struct dentry *dentry)
+ ***************
+ *** 330,336 ****
+    *
+    *	Returns true if the dentry passed is not currently hashed.
+    */
+ ! 
+   static inline int d_unhashed(const struct dentry *dentry)
+   {
+   	return hlist_bl_unhashed(&dentry->d_hash);
+ --- 327,333 ----
+    *
+    *	Returns true if the dentry passed is not currently hashed.
+    */
+ ! 
+   static inline int d_unhashed(const struct dentry *dentry)
+   {
+   	return hlist_bl_unhashed(&dentry->d_hash);
diff --color -rcNP Master/include/linux/elf.h OG/include/linux/elf.h
*** Master/include/linux/elf.h	2021-04-20 14:17:31.000000000 -0400
--- OG/include/linux/elf.h	2021-04-20 15:11:34.513000000 -0400
***************
*** 31,36 ****
--- 31,37 ----
  #define elf_addr_t	Elf32_Off
  #define Elf_Half	Elf32_Half
  #define Elf_Word	Elf32_Word
+ #define elf_dyn		Elf32_Dyn
  
  #else
  
***************
*** 42,47 ****
--- 43,49 ----
  #define elf_addr_t	Elf64_Off
  #define Elf_Half	Elf64_Half
  #define Elf_Word	Elf64_Word
+ #define elf_dyn		Elf64_Dyn
  
  #endif
  
diff --color -rcNP Master/include/linux/minisec.h OG/include/linux/minisec.h
*** Master/include/linux/minisec.h	1969-12-31 19:00:00.000000000 -0500
--- OG/include/linux/minisec.h	2021-04-20 15:11:34.513000000 -0400
***************
*** 0 ****
--- 1,67 ----
+ #ifndef __MINISEC_H
+ #define __MINISEC_H
+ 
+ #include <linux/fs.h>
+ /* Chroot helpers: per-task gr_is_chrooted / gr_chroot_dentry tests, and a two-word capability mask. */
+ #define proc_is_chrooted(tsk_a)  ((tsk_a)->gr_is_chrooted)
+ #define have_same_root(tsk_a,tsk_b) ((tsk_a)->gr_chroot_dentry == (tsk_b)->gr_chroot_dentry)
+ #define GR_CHROOT_CAPS {{ \
+ 	CAP_TO_MASK(CAP_LINUX_IMMUTABLE) | CAP_TO_MASK(CAP_NET_ADMIN) | \
+ 	CAP_TO_MASK(CAP_SYS_MODULE) | CAP_TO_MASK(CAP_SYS_RAWIO) | \
+ 	CAP_TO_MASK(CAP_SYS_PACCT) | CAP_TO_MASK(CAP_SYS_ADMIN) | \
+ 	CAP_TO_MASK(CAP_SYS_BOOT) | CAP_TO_MASK(CAP_SYS_TIME) | \
+ 	CAP_TO_MASK(CAP_NET_RAW) | CAP_TO_MASK(CAP_SYS_TTY_CONFIG) | \
+ 	CAP_TO_MASK(CAP_IPC_OWNER) | CAP_TO_MASK(CAP_SETFCAP), \
+ 	CAP_TO_MASK(CAP_SYSLOG) | CAP_TO_MASK(CAP_MAC_ADMIN) }}
+ /* Defined outside this header; presumably on/off switches for the matching features -- TODO confirm. */
+ extern int grsec_enable_chroot_shmat;
+ extern int grsec_enable_chroot_mount;
+ extern int grsec_enable_chroot_double;
+ extern int grsec_enable_chroot_pivot;
+ extern int grsec_enable_chroot_chdir;
+ extern int grsec_enable_chroot_chmod;
+ extern int grsec_enable_chroot_mknod;
+ extern int grsec_enable_chroot_fchdir;
+ extern int grsec_enable_chroot_nice;
+ extern int grsec_enable_chroot_execlog;
+ extern int grsec_enable_chroot_caps;
+ extern int grsec_enable_chroot_rename;
+ extern int grsec_enable_chroot_sysctl;
+ extern int grsec_enable_chroot_unix;
+ extern int grsec_enable_harden_ipc;
+ extern int grsec_enable_harden_tty;
+ extern int grsec_lock;
+ 
+ int gr_handle_tiocsti(struct tty_struct *tty);
+ /* Chroot-related hooks; only prototypes live here, the implementations are elsewhere. */
+ int gr_pid_is_chrooted(struct task_struct *p);
+ int gr_handle_chroot_fowner(struct pid *pid, enum pid_type type);
+ int gr_handle_chroot_nice(void);
+ int gr_handle_chroot_sysctl(const int op);
+ int gr_handle_chroot_setpriority(struct task_struct *p,
+ 					const int niceval);
+ int gr_chroot_fchdir(struct dentry *u_dentry, struct vfsmount *u_mnt);
+ int gr_chroot_pathat(int dfd, struct dentry *u_dentry, struct vfsmount *u_mnt, unsigned flags);
+ int gr_chroot_fhandle(void);
+ int gr_handle_chroot_chroot(const struct dentry *dentry,
+ 				   const struct vfsmount *mnt);
+ void gr_handle_chroot_chdir(const struct path *path);
+ int gr_handle_chroot_chmod(const struct dentry *dentry,
+ 				  const struct vfsmount *mnt, const int mode);
+ int gr_handle_chroot_mknod(const struct dentry *dentry,
+ 				  const struct vfsmount *mnt, const int mode);
+ int gr_handle_chroot_mount(const struct dentry *dentry,
+ 				  const struct vfsmount *mnt,
+ 				  const char *dev_name);
+ int gr_handle_chroot_pivot(void);
+ int gr_handle_chroot_unix(const pid_t pid);
+ 
+ #ifdef CONFIG_MINISEC_CHROOT_FINDTASK
+ extern int grsec_enable_chroot_findtask;
+ #endif /* CONFIG_MINISEC_CHROOT_FINDTASK */
+ 
+ #ifdef CONFIG_MINISEC_PROC_USERGROUP
+ extern kgid_t grsec_proc_gid;	/* group tested with in_group_p() by the proc restriction code */
+ #endif /* CONFIG_MINISEC_PROC_USERGROUP */
+ 
+ #endif /* __MINISEC_H */
diff --color -rcNP Master/include/linux/mm.h OG/include/linux/mm.h
*** Master/include/linux/mm.h	2021-04-20 14:17:31.000000000 -0400
--- OG/include/linux/mm.h	2021-04-20 15:11:34.514000000 -0400
***************
*** 276,281 ****
--- 276,286 ----
  #define VM_ACCOUNT	0x00100000	/* Is a VM accounted object */
  #define VM_NORESERVE	0x00200000	/* should the VM suppress accounting */
  #define VM_HUGETLB	0x00400000	/* Huge TLB Page VM */
+ 
+ #if defined(CONFIG_MINISEC_PAGEEXEC) && defined(CONFIG_X86_32)
+ #define VM_PAGEEXEC	0x00800000	/* vma->vm_page_prot needs special handling */
+ #endif /* NOTE(review): VM_PAGEEXEC has the same value (0x00800000) as VM_SYNC defined just below -- confirm the two flags cannot conflict */
+ 
  #define VM_SYNC		0x00800000	/* Synchronous page faults */
  #define VM_ARCH_1	0x01000000	/* Architecture-specific flag */
  #define VM_WIPEONFORK	0x02000000	/* Wipe VMA contents in child. */
diff --color -rcNP Master/include/linux/mm_types.h OG/include/linux/mm_types.h
*** Master/include/linux/mm_types.h	2021-04-20 14:17:31.000000000 -0400
--- OG/include/linux/mm_types.h	2021-04-20 15:11:34.514000000 -0400
***************
*** 526,531 ****
--- 526,540 ----
  		struct work_struct async_put_work;
  	} __randomize_layout;
  
+ #if defined(CONFIG_MINISEC)
+ 		unsigned long pax_flags;
+ #endif
+ 
+ #ifdef CONFIG_MINISEC_ASLR
+ 	unsigned long delta_mmap;		/* randomized offset */
+ 	unsigned long delta_stack;		/* randomized offset */
+ #endif
+ 
  	/*
  	 * The mm_cpumask needs to be at the end of mm_struct, because it
  	 * is dynamically sized based on nr_cpu_ids.
diff --color -rcNP Master/include/linux/proc_fs.h OG/include/linux/proc_fs.h
*** Master/include/linux/proc_fs.h	2021-04-20 14:17:31.000000000 -0400
--- OG/include/linux/proc_fs.h	2021-04-20 15:11:34.514000000 -0400
***************
*** 23,30 ****
--- 23,33 ----
  		struct proc_dir_entry *, const char *);
  struct proc_dir_entry *_proc_mkdir(const char *, umode_t, struct proc_dir_entry *, void *, bool);
  extern struct proc_dir_entry *proc_mkdir(const char *, struct proc_dir_entry *);
+ extern struct proc_dir_entry *proc_mkdir_restrict(const char *, struct proc_dir_entry *);
  extern struct proc_dir_entry *proc_mkdir_data(const char *, umode_t,
  					      struct proc_dir_entry *, void *);
+ extern struct proc_dir_entry *proc_mkdir_data_restrict(const char *, umode_t,
+ 					      struct proc_dir_entry *, void *);
  extern struct proc_dir_entry *proc_mkdir_mode(const char *, umode_t,
  					      struct proc_dir_entry *);
  struct proc_dir_entry *proc_create_mount_point(const char *name);
***************
*** 41,53 ****
  		int (*show)(struct seq_file *, void *), void *data);
  #define proc_create_single(name, mode, parent, show) \
  	proc_create_single_data(name, mode, parent, show, NULL)
!  
  extern struct proc_dir_entry *proc_create_data(const char *, umode_t,
  					       struct proc_dir_entry *,
  					       const struct file_operations *,
  					       void *);
  
  struct proc_dir_entry *proc_create(const char *name, umode_t mode, struct proc_dir_entry *parent, const struct file_operations *proc_fops);
  extern void proc_set_size(struct proc_dir_entry *, loff_t);
  extern void proc_set_user(struct proc_dir_entry *, kuid_t, kgid_t);
  extern void *PDE_DATA(const struct inode *);
--- 44,69 ----
  		int (*show)(struct seq_file *, void *), void *data);
  #define proc_create_single(name, mode, parent, show) \
  	proc_create_single_data(name, mode, parent, show, NULL)
! 
  extern struct proc_dir_entry *proc_create_data(const char *, umode_t,
  					       struct proc_dir_entry *,
  					       const struct file_operations *,
  					       void *);
  
  struct proc_dir_entry *proc_create(const char *name, umode_t mode, struct proc_dir_entry *parent, const struct file_operations *proc_fops);
+ 
+ /* Create a /proc file via proc_create_data(); PROC_USER/PROC_USERGROUP builds override @mode. */
+ static inline struct proc_dir_entry *proc_create_grsec(const char *name, umode_t mode,
+ 	struct proc_dir_entry *parent, const struct file_operations *proc_fops)
+ {
+ #ifdef CONFIG_MINISEC_PROC_USER
+ 	mode = S_IRUSR;			/* caller's mode is ignored: owner read only */
+ #elif defined(CONFIG_MINISEC_PROC_USERGROUP)
+ 	mode = S_IRUSR | S_IRGRP;	/* caller's mode is ignored: owner and group read */
+ #endif
+ 	return proc_create_data(name, mode, parent, proc_fops, NULL);	/* fops passed as declared, no cast */
+ }
+ 
  extern void proc_set_size(struct proc_dir_entry *, loff_t);
  extern void proc_set_user(struct proc_dir_entry *, kuid_t, kgid_t);
  extern void *PDE_DATA(const struct inode *);
***************
*** 99,104 ****
--- 115,122 ----
  		struct proc_dir_entry *parent,const char *dest) { return NULL;}
  static inline struct proc_dir_entry *proc_mkdir(const char *name,
  	struct proc_dir_entry *parent) {return NULL;}
+ static inline struct proc_dir_entry *proc_mkdir_restrict(const char *name,
+ 	struct proc_dir_entry *parent) { return NULL; } /* stub: creates nothing, always returns NULL */
  static inline struct proc_dir_entry *proc_create_mount_point(const char *name) { return NULL; }
  static inline struct proc_dir_entry *_proc_mkdir(const char *name, umode_t mode,
  		struct proc_dir_entry *parent, void *data, bool force_lookup)
***************
*** 107,112 ****
--- 125,132 ----
  }
  static inline struct proc_dir_entry *proc_mkdir_data(const char *name,
  	umode_t mode, struct proc_dir_entry *parent, void *data) { return NULL; }
+ static inline  struct proc_dir_entry *proc_mkdir_data_restrict(const char *name,
+ 	umode_t mode, struct proc_dir_entry *parent, void *data) { return NULL; } /* stub: creates nothing, always returns NULL */
  static inline struct proc_dir_entry *proc_mkdir_mode(const char *name,
  	umode_t mode, struct proc_dir_entry *parent) { return NULL; }
  #define proc_create_seq_private(name, mode, parent, ops, size, data) ({NULL;})
diff --color -rcNP Master/include/linux/proc_fs.h.orig OG/include/linux/proc_fs.h.orig
*** Master/include/linux/proc_fs.h.orig	1969-12-31 19:00:00.000000000 -0500
--- OG/include/linux/proc_fs.h.orig	2021-04-20 15:10:45.385000000 -0400
***************
*** 0 ****
--- 1,165 ----
+ /* SPDX-License-Identifier: GPL-2.0 */
+ /*
+  * The proc filesystem constants/structures
+  */
+ #ifndef _LINUX_PROC_FS_H
+ #define _LINUX_PROC_FS_H
+ 
+ #include <linux/types.h>
+ #include <linux/fs.h>
+ 
+ struct proc_dir_entry;
+ struct seq_file;
+ struct seq_operations;
+ 
+ #ifdef CONFIG_PROC_FS
+ 
+ typedef int (*proc_write_t)(struct file *, char *, size_t);
+ 
+ extern void proc_root_init(void);
+ extern void proc_flush_task(struct task_struct *);
+ 
+ extern struct proc_dir_entry *proc_symlink(const char *,
+ 		struct proc_dir_entry *, const char *);
+ struct proc_dir_entry *_proc_mkdir(const char *, umode_t, struct proc_dir_entry *, void *, bool);
+ extern struct proc_dir_entry *proc_mkdir(const char *, struct proc_dir_entry *);
+ extern struct proc_dir_entry *proc_mkdir_restrict(const char *, struct proc_dir_entry *);
+ extern struct proc_dir_entry *proc_mkdir_data(const char *, umode_t,
+ 					      struct proc_dir_entry *, void *);
+ extern struct proc_dir_entry *proc_mkdir_data_restrict(const char *, umode_t,
+ 					      struct proc_dir_entry *, void *);
+ extern struct proc_dir_entry *proc_mkdir_mode(const char *, umode_t,
+ 					      struct proc_dir_entry *);
+ struct proc_dir_entry *proc_create_mount_point(const char *name);
+ 
+ struct proc_dir_entry *proc_create_seq_private(const char *name, umode_t mode,
+ 		struct proc_dir_entry *parent, const struct seq_operations *ops,
+ 		unsigned int state_size, void *data);
+ #define proc_create_seq_data(name, mode, parent, ops, data) \
+ 	proc_create_seq_private(name, mode, parent, ops, 0, data)
+ #define proc_create_seq(name, mode, parent, ops) \
+ 	proc_create_seq_private(name, mode, parent, ops, 0, NULL)
+ struct proc_dir_entry *proc_create_single_data(const char *name, umode_t mode,
+ 		struct proc_dir_entry *parent,
+ 		int (*show)(struct seq_file *, void *), void *data);
+ #define proc_create_single(name, mode, parent, show) \
+ 	proc_create_single_data(name, mode, parent, show, NULL)
+  
+ extern struct proc_dir_entry *proc_create_data(const char *, umode_t,
+ 					       struct proc_dir_entry *,
+ 					       const struct file_operations *,
+ 					       void *);
+ 
+ struct proc_dir_entry *proc_create(const char *name, umode_t mode, struct proc_dir_entry *parent, const struct file_operations *proc_fops);
+ extern void proc_set_size(struct proc_dir_entry *, loff_t);
+ extern void proc_set_user(struct proc_dir_entry *, kuid_t, kgid_t);
+ extern void *PDE_DATA(const struct inode *);
+ extern void *proc_get_parent_data(const struct inode *);
+ extern void proc_remove(struct proc_dir_entry *);
+ extern void remove_proc_entry(const char *, struct proc_dir_entry *);
+ extern int remove_proc_subtree(const char *, struct proc_dir_entry *);
+ 
+ struct proc_dir_entry *proc_create_net_data(const char *name, umode_t mode,
+ 		struct proc_dir_entry *parent, const struct seq_operations *ops,
+ 		unsigned int state_size, void *data);
+ #define proc_create_net(name, mode, parent, state_size, ops) \
+ 	proc_create_net_data(name, mode, parent, state_size, ops, NULL)
+ struct proc_dir_entry *proc_create_net_single(const char *name, umode_t mode,
+ 		struct proc_dir_entry *parent,
+ 		int (*show)(struct seq_file *, void *), void *data);
+ struct proc_dir_entry *proc_create_net_data_write(const char *name, umode_t mode,
+ 						  struct proc_dir_entry *parent,
+ 						  const struct seq_operations *ops,
+ 						  proc_write_t write,
+ 						  unsigned int state_size, void *data);
+ struct proc_dir_entry *proc_create_net_single_write(const char *name, umode_t mode,
+ 						    struct proc_dir_entry *parent,
+ 						    int (*show)(struct seq_file *, void *),
+ 						    proc_write_t write,
+ 						    void *data);
+ extern struct pid *tgid_pidfd_to_pid(const struct file *file);
+ 
+ #ifdef CONFIG_PROC_PID_ARCH_STATUS
+ /*
+  * The architecture which selects CONFIG_PROC_PID_ARCH_STATUS must
+  * provide proc_pid_arch_status() definition.
+  */
+ int proc_pid_arch_status(struct seq_file *m, struct pid_namespace *ns,
+ 			struct pid *pid, struct task_struct *task);
+ #endif /* CONFIG_PROC_PID_ARCH_STATUS */
+ 
+ #else /* CONFIG_PROC_FS */
+ 
+ static inline void proc_root_init(void)
+ {
+ }
+ 
+ static inline void proc_flush_task(struct task_struct *task)
+ {
+ }
+ 
+ static inline struct proc_dir_entry *proc_symlink(const char *name,
+ 		struct proc_dir_entry *parent,const char *dest) { return NULL;}
+ static inline struct proc_dir_entry *proc_mkdir(const char *name,
+ 	struct proc_dir_entry *parent) {return NULL;}
+ static inline struct proc_dir_entry *proc_mkdir_restrict(const char *name,
+ 	struct proc_dir_entry *parent) { return NULL; }
+ static inline struct proc_dir_entry *proc_create_mount_point(const char *name) { return NULL; }
+ static inline struct proc_dir_entry *_proc_mkdir(const char *name, umode_t mode,
+ 		struct proc_dir_entry *parent, void *data, bool force_lookup)
+ {
+ 	return NULL;
+ }
+ static inline struct proc_dir_entry *proc_mkdir_data(const char *name,
+ 	umode_t mode, struct proc_dir_entry *parent, void *data) { return NULL; }
+ static inline  struct proc_dir_entry *proc_mkdir_data_restrict(const char *name,
+ 	umode_t mode, struct proc_dir_entry *parent, void *data) { return NULL; }
+ static inline struct proc_dir_entry *proc_mkdir_mode(const char *name,
+ 	umode_t mode, struct proc_dir_entry *parent) { return NULL; }
+ #define proc_create_seq_private(name, mode, parent, ops, size, data) ({NULL;})
+ #define proc_create_seq_data(name, mode, parent, ops, data) ({NULL;})
+ #define proc_create_seq(name, mode, parent, ops) ({NULL;})
+ #define proc_create_single(name, mode, parent, show) ({NULL;})
+ #define proc_create_single_data(name, mode, parent, show, data) ({NULL;})
+ #define proc_create(name, mode, parent, proc_fops) ({NULL;})
+ #define proc_create_data(name, mode, parent, proc_fops, data) ({NULL;})
+ 
+ static inline void proc_set_size(struct proc_dir_entry *de, loff_t size) {}
+ static inline void proc_set_user(struct proc_dir_entry *de, kuid_t uid, kgid_t gid) {}
+ static inline void *PDE_DATA(const struct inode *inode) {BUG(); return NULL;}
+ static inline void *proc_get_parent_data(const struct inode *inode) { BUG(); return NULL; }
+ 
+ static inline void proc_remove(struct proc_dir_entry *de) {}
+ #define remove_proc_entry(name, parent) do {} while (0)
+ static inline int remove_proc_subtree(const char *name, struct proc_dir_entry *parent) { return 0; }
+ 
+ #define proc_create_net_data(name, mode, parent, ops, state_size, data) ({NULL;})
+ #define proc_create_net(name, mode, parent, state_size, ops) ({NULL;})
+ #define proc_create_net_single(name, mode, parent, show, data) ({NULL;})
+ 
+ static inline struct pid *tgid_pidfd_to_pid(const struct file *file)
+ {
+ 	return ERR_PTR(-EBADF);
+ }
+ 
+ #endif /* CONFIG_PROC_FS */
+ 
+ struct net;
+ 
+ static inline struct proc_dir_entry *proc_net_mkdir(
+ 	struct net *net, const char *name, struct proc_dir_entry *parent)
+ {
+ 	return _proc_mkdir(name, 0, parent, net, true);
+ }
+ 
+ struct ns_common;
+ int open_related_ns(struct ns_common *ns,
+ 		   struct ns_common *(*get_ns)(struct ns_common *ns));
+ 
+ /* get the associated pid namespace for a file in procfs */
+ static inline struct pid_namespace *proc_pid_ns(const struct inode *inode)
+ {
+ 	return inode->i_sb->s_fs_info;
+ }
+ 
+ #endif /* _LINUX_PROC_FS_H */
diff --color -rcNP Master/include/linux/proc_fs.h.rej OG/include/linux/proc_fs.h.rej
*** Master/include/linux/proc_fs.h.rej	1969-12-31 19:00:00.000000000 -0500
--- OG/include/linux/proc_fs.h.rej	2021-04-20 15:11:27.321000000 -0400
***************
*** 0 ****
--- 1,44 ----
+ *** include/linux/proc_fs.h	2021-03-13 15:30:34.000000000 +0200
+ --- include/linux/proc_fs.h	2021-03-11 15:06:51.000000000 +0200
+ ***************
+ *** 41,66 ****
+   		int (*show)(struct seq_file *, void *), void *data);
+   #define proc_create_single(name, mode, parent, show) \
+   	proc_create_single_data(name, mode, parent, show, NULL)
+ ! 
+   extern struct proc_dir_entry *proc_create_data(const char *, umode_t,
+   					       struct proc_dir_entry *,
+   					       const struct file_operations *,
+   					       void *);
+   
+   struct proc_dir_entry *proc_create(const char *name, umode_t mode, struct proc_dir_entry *parent, const struct file_operations *proc_fops);
+ - 
+ - static inline struct proc_dir_entry *proc_create_grsec(const char *name, umode_t mode,
+ - 	struct proc_dir_entry *parent, const struct file_operations *proc_fops)
+ - {
+ - #ifdef CONFIG_MINISEC_PROC_USER
+ - 	return proc_create_data(name, S_IRUSR, parent, proc_fops, NULL);
+ - #elif defined(CONFIG_MINISEC_PROC_USERGROUP)
+ - 	return proc_create_data(name, S_IRUSR | S_IRGRP, parent, proc_fops, NULL);
+ - #else
+ - 	return proc_create_data(name, mode, parent, proc_fops, NULL);
+ - #endif
+ - }
+ - 
+   extern void proc_set_size(struct proc_dir_entry *, loff_t);
+   extern void proc_set_user(struct proc_dir_entry *, kuid_t, kgid_t);
+   extern void *PDE_DATA(const struct inode *);
+ --- 38,50 ----
+   		int (*show)(struct seq_file *, void *), void *data);
+   #define proc_create_single(name, mode, parent, show) \
+   	proc_create_single_data(name, mode, parent, show, NULL)
+ ! 
+   extern struct proc_dir_entry *proc_create_data(const char *, umode_t,
+   					       struct proc_dir_entry *,
+   					       const struct file_operations *,
+   					       void *);
+   
+   struct proc_dir_entry *proc_create(const char *name, umode_t mode, struct proc_dir_entry *parent, const struct file_operations *proc_fops);
+   extern void proc_set_size(struct proc_dir_entry *, loff_t);
+   extern void proc_set_user(struct proc_dir_entry *, kuid_t, kgid_t);
+   extern void *PDE_DATA(const struct inode *);
diff --color -rcNP Master/include/linux/random.h OG/include/linux/random.h
*** Master/include/linux/random.h	2021-04-20 14:17:31.000000000 -0400
--- OG/include/linux/random.h	2021-04-20 15:11:34.514000000 -0400
***************
*** 144,147 ****
--- 144,152 ----
  }
  #endif
  
+ /* Return a random unsigned long taken from get_random_long() (kernel CSPRNG). */
+ static inline unsigned long pax_get_random_long(void)
+ {
+ 	return get_random_long();	/* prandom_u32() output is predictable; do not use it here */
+ }
  #endif /* _LINUX_RANDOM_H */
diff --color -rcNP Master/include/linux/sched/signal.h OG/include/linux/sched/signal.h
*** Master/include/linux/sched/signal.h	2021-04-20 14:17:31.000000000 -0400
--- OG/include/linux/sched/signal.h	2021-04-20 15:11:34.514000000 -0400
***************
*** 206,211 ****
--- 206,220 ----
  #ifdef CONFIG_TASKSTATS
  	struct taskstats *stats;
  #endif
+ #ifdef CONFIG_MINISEC
+ 	u32 curr_ip;
+ 	u32 saved_ip;
+ 	u32 gr_saddr;
+ 	u32 gr_daddr;
+ 	u16 gr_sport;
+ 	u16 gr_dport;
+ 	u8 used_accept:1;
+ #endif
  #ifdef CONFIG_AUDIT
  	unsigned audit_tty;
  	struct tty_audit_buf *tty_audit_buf;
diff --color -rcNP Master/include/linux/sched.h OG/include/linux/sched.h
*** Master/include/linux/sched.h	2021-04-20 14:17:31.000000000 -0400
--- OG/include/linux/sched.h	2021-04-20 15:16:03.348000000 -0400
***************
*** 27,32 ****
--- 27,33 ----
  #include <linux/sched/prio.h>
  #include <linux/sched/types.h>
  #include <linux/signal_types.h>
+ #include <linux/mm_types.h>
  #include <linux/mm_types_task.h>
  #include <linux/task_io_accounting.h>
  #include <linux/posix-timers.h>
***************
*** 1268,1273 ****
--- 1269,1279 ----
  	unsigned long			prev_lowest_stack;
  #endif
  
+ #ifdef CONFIG_MINISEC
+ 	struct dentry *gr_chroot_dentry;
+ 	u8 gr_is_chrooted;
+ #endif
+ 
  	/*
  	 * New fields for task_struct should be added above here, so that
  	 * they are included in the randomized portion of task_struct.
***************
*** 1663,1668 ****
--- 1669,1675 ----
   */
  
  extern struct task_struct *find_task_by_vpid(pid_t nr);
+ extern struct task_struct *find_task_by_vpid_unrestricted(pid_t nr);
  extern struct task_struct *find_task_by_pid_ns(pid_t nr, struct pid_namespace *ns);
  
  /*
***************
*** 1857,1862 ****
--- 1864,1908 ----
  #define TASK_SIZE_OF(tsk)	TASK_SIZE
  #endif
  
+ #define MF_PAX_PAGEEXEC		0x01000000	/* Paging based non-executable pages */
+ #define MF_PAX_EMUTRAMP		0x02000000	/* Emulate trampolines */
+ #define MF_PAX_MPROTECT		0x04000000	/* Restrict mprotect() */
+ #define MF_PAX_RANDMMAP		0x08000000	/* Randomize mmap() base */
+ #define MF_PAX_SEGMEXEC		0x20000000	/* Segmentation based non-executable pages */
+ 
+ #ifdef CONFIG_MINISEC_SOFTMODE
+ extern int pax_softmode;
+ #endif
+ 
+ extern int pax_check_flags(unsigned long *);
+ #define PAX_PARSE_FLAGS_FALLBACK	(~0UL)
+ 
+ /* if tsk != current then task_lock must be held on it */
+ #if defined(CONFIG_MINISEC_NOEXEC) || defined(CONFIG_MINISEC_ASLR)
+ static inline unsigned long pax_get_flags(struct task_struct *tsk)
+ {
+ 	/* tsk->mm may be NULL (e.g. kernel threads): report no flags instead of oopsing. */
+ 	if (unlikely(!tsk || !tsk->mm))
+ 		return 0UL;
+ 	return tsk->mm->pax_flags;
+ }
+ 
+ /* if tsk != current then task_lock must be held on it */
+ static inline long pax_set_flags(struct task_struct *tsk, unsigned long flags)
+ {
+ 	/* No task, or a task without an mm (e.g. kernel thread): nowhere to store @flags. */
+ 	if (unlikely(!tsk || !tsk->mm))
+ 		return -EINVAL;
+ 	tsk->mm->pax_flags = flags;
+ 	return 0;	/* stored in the task's mm_struct */
+ }
+ #endif
+ 
+ struct path;
+ extern char *pax_get_path(const struct path *path, char *buf, int buflen);
+ extern void pax_report_fault(struct pt_regs *regs, void *pc, void *sp);
+ extern void pax_report_insns(struct pt_regs *regs, void *pc, void *sp);
+ 
  #ifdef CONFIG_RSEQ
  
  /*
diff --color -rcNP Master/include/linux/sysctl.h OG/include/linux/sysctl.h
*** Master/include/linux/sysctl.h	2021-04-20 14:17:31.000000000 -0400
--- OG/include/linux/sysctl.h	2021-04-20 15:11:34.515000000 -0400
***************
*** 8,14 ****
   ****************************************************************
   **
   **  WARNING:
!  **  The values in this file are exported to user space via 
   **  the sysctl() binary interface.  Do *NOT* change the
   **  numbering of any existing values here, and do not change
   **  any numbers within any one set of values.  If you have to
--- 8,14 ----
   ****************************************************************
   **
   **  WARNING:
!  **  The values in this file are exported to user space via
   **  the sysctl() binary interface.  Do *NOT* change the
   **  numbering of any existing values here, and do not change
   **  any numbers within any one set of values.  If you have to
***************
*** 51,56 ****
--- 51,58 ----
  			 void __user *, size_t *, loff_t *);
  extern int proc_dointvec(struct ctl_table *, int,
  			 void __user *, size_t *, loff_t *);
+ extern int proc_dointvec_secure(struct ctl_table *, int,
+ 				void __user *, size_t *, loff_t *);
  extern int proc_douintvec(struct ctl_table *, int,
  			 void __user *, size_t *, loff_t *);
  extern int proc_dointvec_minmax(struct ctl_table *, int,
***************
*** 78,84 ****
  
  /*
   * Register a set of sysctl names by calling register_sysctl_table
!  * with an initialised array of struct ctl_table's.  An entry with 
   * NULL procname terminates the table.  table->de will be
   * set up by the registration and need not be initialised in advance.
   *
--- 80,86 ----
  
  /*
   * Register a set of sysctl names by calling register_sysctl_table
!  * with an initialised array of struct ctl_table's.  An entry with
   * NULL procname terminates the table.  table->de will be
   * set up by the registration and need not be initialised in advance.
   *
***************
*** 96,102 ****
   * the sysctl table.  The data and maxlen fields of the ctl_table
   * struct enable minimal validation of the values being written to be
   * performed, and the mode field allows minimal authentication.
!  * 
   * There must be a proc_handler routine for any terminal nodes
   * mirrored under /proc/sys (non-terminals are handled by a built-in
   * directory handler).  Several default handlers are available to
--- 98,104 ----
   * the sysctl table.  The data and maxlen fields of the ctl_table
   * struct enable minimal validation of the values being written to be
   * performed, and the mode field allows minimal authentication.
!  *
   * There must be a proc_handler routine for any terminal nodes
   * mirrored under /proc/sys (non-terminals are handled by a built-in
   * directory handler).  Several default handlers are available to
***************
*** 122,128 ****
  	struct ctl_table_poll name = __CTL_TABLE_POLL_INITIALIZER(name)
  
  /* A sysctl table is an array of struct ctl_table: */
! struct ctl_table 
  {
  	const char *procname;		/* Text ID for /proc/sys, or zero */
  	void *data;
--- 124,130 ----
  	struct ctl_table_poll name = __CTL_TABLE_POLL_INITIALIZER(name)
  
  /* A sysctl table is an array of struct ctl_table: */
! struct ctl_table
  {
  	const char *procname;		/* Text ID for /proc/sys, or zero */
  	void *data;
diff --color -rcNP Master/include/linux/sysctl.h.orig OG/include/linux/sysctl.h.orig
*** Master/include/linux/sysctl.h.orig	1969-12-31 19:00:00.000000000 -0500
--- OG/include/linux/sysctl.h.orig	2021-04-20 15:10:45.386000000 -0400
***************
*** 0 ****
--- 1,250 ----
+ /* SPDX-License-Identifier: GPL-2.0 */
+ /*
+  * sysctl.h: General linux system control interface
+  *
+  * Begun 24 March 1995, Stephen Tweedie
+  *
+  ****************************************************************
+  ****************************************************************
+  **
+  **  WARNING:
+  **  The values in this file are exported to user space via 
+  **  the sysctl() binary interface.  Do *NOT* change the
+  **  numbering of any existing values here, and do not change
+  **  any numbers within any one set of values.  If you have to
+  **  redefine an existing interface, use a new number for it.
+  **  The kernel will then return -ENOTDIR to any application using
+  **  the old binary interface.
+  **
+  ****************************************************************
+  ****************************************************************
+  */
+ #ifndef _LINUX_SYSCTL_H
+ #define _LINUX_SYSCTL_H
+ 
+ #include <linux/list.h>
+ #include <linux/rcupdate.h>
+ #include <linux/wait.h>
+ #include <linux/rbtree.h>
+ #include <linux/uidgid.h>
+ #include <uapi/linux/sysctl.h>
+ 
+ /* For the /proc/sys support */
+ struct completion;
+ struct ctl_table;
+ struct nsproxy;
+ struct ctl_table_root;
+ struct ctl_table_header;
+ struct ctl_dir;
+ 
+ /* Keep the same order as in fs/proc/proc_sysctl.c */
+ #define SYSCTL_ZERO	((void *)&sysctl_vals[0])
+ #define SYSCTL_ONE	((void *)&sysctl_vals[1])
+ #define SYSCTL_INT_MAX	((void *)&sysctl_vals[2])
+ 
+ extern const int sysctl_vals[];
+ 
+ typedef int proc_handler (struct ctl_table *ctl, int write,
+ 			  void __user *buffer, size_t *lenp, loff_t *ppos);
+ 
+ extern int proc_dostring(struct ctl_table *, int,
+ 			 void __user *, size_t *, loff_t *);
+ extern int proc_dointvec(struct ctl_table *, int,
+ 			 void __user *, size_t *, loff_t *);
+ int proc_dointvec_secure(struct ctl_table *, int,
+                     		void __user *, size_t *, loff_t *);
+ extern int proc_douintvec(struct ctl_table *, int,
+ 			 void __user *, size_t *, loff_t *);
+ extern int proc_dointvec_minmax(struct ctl_table *, int,
+ 				void __user *, size_t *, loff_t *);
+ extern int proc_douintvec_minmax(struct ctl_table *table, int write,
+ 				 void __user *buffer, size_t *lenp,
+ 				 loff_t *ppos);
+ extern int proc_dointvec_minmax_sysadmin(struct ctl_table *table, int write,
+ 					 void *buffer, size_t *lenp, loff_t *ppos);
+ extern int proc_dointvec_jiffies(struct ctl_table *, int,
+ 				 void __user *, size_t *, loff_t *);
+ extern int proc_dointvec_userhz_jiffies(struct ctl_table *, int,
+ 					void __user *, size_t *, loff_t *);
+ extern int proc_dointvec_ms_jiffies(struct ctl_table *, int,
+ 				    void __user *, size_t *, loff_t *);
+ extern int proc_doulongvec_minmax(struct ctl_table *, int,
+ 				  void __user *, size_t *, loff_t *);
+ extern int proc_doulongvec_ms_jiffies_minmax(struct ctl_table *table, int,
+ 				      void __user *, size_t *, loff_t *);
+ extern int proc_do_large_bitmap(struct ctl_table *, int,
+ 				void __user *, size_t *, loff_t *);
+ extern int proc_do_static_key(struct ctl_table *table, int write,
+ 			      void __user *buffer, size_t *lenp,
+ 			      loff_t *ppos);
+ 
+ /*
+  * Register a set of sysctl names by calling register_sysctl_table
+  * with an initialised array of struct ctl_table's.  An entry with 
+  * NULL procname terminates the table.  table->de will be
+  * set up by the registration and need not be initialised in advance.
+  *
+  * sysctl names can be mirrored automatically under /proc/sys.  The
+  * procname supplied controls /proc naming.
+  *
+  * The table's mode will be honoured both for sys_sysctl(2) and
+  * proc-fs access.
+  *
+  * Leaf nodes in the sysctl tree will be represented by a single file
+  * under /proc; non-leaf nodes will be represented by directories.  A
+  * null procname disables /proc mirroring at this node.
+  *
+  * sysctl(2) can automatically manage read and write requests through
+  * the sysctl table.  The data and maxlen fields of the ctl_table
+  * struct enable minimal validation of the values being written to be
+  * performed, and the mode field allows minimal authentication.
+  * 
+  * There must be a proc_handler routine for any terminal nodes
+  * mirrored under /proc/sys (non-terminals are handled by a built-in
+  * directory handler).  Several default handlers are available to
+  * cover common cases.
+  */
+ 
+ /* Support for userspace poll() to watch for changes */
+ struct ctl_table_poll {
+ 	atomic_t event;
+ 	wait_queue_head_t wait;
+ };
+ 
+ static inline void *proc_sys_poll_event(struct ctl_table_poll *poll)
+ {
+ 	return (void *)(unsigned long)atomic_read(&poll->event);
+ }
+ 
+ #define __CTL_TABLE_POLL_INITIALIZER(name) {				\
+ 	.event = ATOMIC_INIT(0),					\
+ 	.wait = __WAIT_QUEUE_HEAD_INITIALIZER(name.wait) }
+ 
+ #define DEFINE_CTL_TABLE_POLL(name)					\
+ 	struct ctl_table_poll name = __CTL_TABLE_POLL_INITIALIZER(name)
+ 
+ /* A sysctl table is an array of struct ctl_table: */
+ struct ctl_table 
+ {
+ 	const char *procname;		/* Text ID for /proc/sys, or zero */
+ 	void *data;
+ 	int maxlen;
+ 	umode_t mode;
+ 	struct ctl_table *child;	/* Deprecated */
+ 	proc_handler *proc_handler;	/* Callback for text formatting */
+ 	struct ctl_table_poll *poll;
+ 	void *extra1;
+ 	void *extra2;
+ } __randomize_layout;
+ 
+ struct ctl_node {
+ 	struct rb_node node;
+ 	struct ctl_table_header *header;
+ };
+ 
+ /* struct ctl_table_header is used to maintain dynamic lists of
+    struct ctl_table trees. */
+ struct ctl_table_header
+ {
+ 	union {
+ 		struct {
+ 			struct ctl_table *ctl_table;
+ 			int used;
+ 			int count;
+ 			int nreg;
+ 		};
+ 		struct rcu_head rcu;
+ 	};
+ 	struct completion *unregistering;
+ 	struct ctl_table *ctl_table_arg;
+ 	struct ctl_table_root *root;
+ 	struct ctl_table_set *set;
+ 	struct ctl_dir *parent;
+ 	struct ctl_node *node;
+ 	struct hlist_head inodes; /* head for proc_inode->sysctl_inodes */
+ };
+ 
+ struct ctl_dir {
+ 	/* Header must be at the start of ctl_dir */
+ 	struct ctl_table_header header;
+ 	struct rb_root root;
+ };
+ 
+ struct ctl_table_set {
+ 	int (*is_seen)(struct ctl_table_set *);
+ 	struct ctl_dir dir;
+ };
+ 
+ struct ctl_table_root {
+ 	struct ctl_table_set default_set;
+ 	struct ctl_table_set *(*lookup)(struct ctl_table_root *root);
+ 	void (*set_ownership)(struct ctl_table_header *head,
+ 			      struct ctl_table *table,
+ 			      kuid_t *uid, kgid_t *gid);
+ 	int (*permissions)(struct ctl_table_header *head, struct ctl_table *table);
+ };
+ 
+ /* struct ctl_path describes where in the hierarchy a table is added */
+ struct ctl_path {
+ 	const char *procname;
+ };
+ 
+ #ifdef CONFIG_SYSCTL
+ 
+ void proc_sys_poll_notify(struct ctl_table_poll *poll);
+ 
+ extern void setup_sysctl_set(struct ctl_table_set *p,
+ 	struct ctl_table_root *root,
+ 	int (*is_seen)(struct ctl_table_set *));
+ extern void retire_sysctl_set(struct ctl_table_set *set);
+ 
+ struct ctl_table_header *__register_sysctl_table(
+ 	struct ctl_table_set *set,
+ 	const char *path, struct ctl_table *table);
+ struct ctl_table_header *__register_sysctl_paths(
+ 	struct ctl_table_set *set,
+ 	const struct ctl_path *path, struct ctl_table *table);
+ struct ctl_table_header *register_sysctl(const char *path, struct ctl_table *table);
+ struct ctl_table_header *register_sysctl_table(struct ctl_table * table);
+ struct ctl_table_header *register_sysctl_paths(const struct ctl_path *path,
+ 						struct ctl_table *table);
+ 
+ void unregister_sysctl_table(struct ctl_table_header * table);
+ 
+ extern int sysctl_init(void);
+ 
+ extern struct ctl_table sysctl_mount_point[];
+ 
+ #else /* CONFIG_SYSCTL */
+ static inline struct ctl_table_header *register_sysctl_table(struct ctl_table * table)
+ {
+ 	return NULL;
+ }
+ 
+ static inline struct ctl_table_header *register_sysctl_paths(
+ 			const struct ctl_path *path, struct ctl_table *table)
+ {
+ 	return NULL;
+ }
+ 
+ static inline struct ctl_table_header *register_sysctl(const char *path, struct ctl_table *table)
+ {
+ 	return NULL;
+ }
+ 
+ static inline void unregister_sysctl_table(struct ctl_table_header * table)
+ {
+ }
+ 
+ static inline void setup_sysctl_set(struct ctl_table_set *p,
+ 	struct ctl_table_root *root,
+ 	int (*is_seen)(struct ctl_table_set *))
+ {
+ }
+ 
+ #endif /* CONFIG_SYSCTL */
+ 
+ int sysctl_max_threads(struct ctl_table *table, int write,
+ 		       void __user *buffer, size_t *lenp, loff_t *ppos);
+ 
+ #endif /* _LINUX_SYSCTL_H */
diff --color -rcNP Master/include/linux/sysctl.h.rej OG/include/linux/sysctl.h.rej
*** Master/include/linux/sysctl.h.rej	1969-12-31 19:00:00.000000000 -0500
--- OG/include/linux/sysctl.h.rej	2021-04-20 15:11:27.323000000 -0400
***************
*** 0 ****
--- 1,70 ----
+ *** include/linux/sysctl.h	2021-03-13 15:39:03.000000000 +0200
+ --- include/linux/sysctl.h	2021-03-13 19:47:10.000000000 +0200
+ ***************
+ *** 8,14 ****
+    ****************************************************************
+    **
+    **  WARNING:
+ !  **  The values in this file are exported to user space via
+    **  the sysctl() binary interface.  Do *NOT* change the
+    **  numbering of any existing values here, and do not change
+    **  any numbers within any one set of values.  If you have to
+ --- 8,14 ----
+    ****************************************************************
+    **
+    **  WARNING:
+ !  **  The values in this file are exported to user space via
+    **  the sysctl() binary interface.  Do *NOT* change the
+    **  numbering of any existing values here, and do not change
+    **  any numbers within any one set of values.  If you have to
+ ***************
+ *** 78,84 ****
+   
+   /*
+    * Register a set of sysctl names by calling register_sysctl_table
+ !  * with an initialised array of struct ctl_table's.  An entry with
+    * NULL procname terminates the table.  table->de will be
+    * set up by the registration and need not be initialised in advance.
+    *
+ --- 76,82 ----
+   
+   /*
+    * Register a set of sysctl names by calling register_sysctl_table
+ !  * with an initialised array of struct ctl_table's.  An entry with
+    * NULL procname terminates the table.  table->de will be
+    * set up by the registration and need not be initialised in advance.
+    *
+ ***************
+ *** 96,102 ****
+    * the sysctl table.  The data and maxlen fields of the ctl_table
+    * struct enable minimal validation of the values being written to be
+    * performed, and the mode field allows minimal authentication.
+ !  *
+    * There must be a proc_handler routine for any terminal nodes
+    * mirrored under /proc/sys (non-terminals are handled by a built-in
+    * directory handler).  Several default handlers are available to
+ --- 94,100 ----
+    * the sysctl table.  The data and maxlen fields of the ctl_table
+    * struct enable minimal validation of the values being written to be
+    * performed, and the mode field allows minimal authentication.
+ !  *
+    * There must be a proc_handler routine for any terminal nodes
+    * mirrored under /proc/sys (non-terminals are handled by a built-in
+    * directory handler).  Several default handlers are available to
+ ***************
+ *** 122,128 ****
+   	struct ctl_table_poll name = __CTL_TABLE_POLL_INITIALIZER(name)
+   
+   /* A sysctl table is an array of struct ctl_table: */
+ ! struct ctl_table
+   {
+   	const char *procname;		/* Text ID for /proc/sys, or zero */
+   	void *data;
+ --- 120,126 ----
+   	struct ctl_table_poll name = __CTL_TABLE_POLL_INITIALIZER(name)
+   
+   /* A sysctl table is an array of struct ctl_table: */
+ ! struct ctl_table
+   {
+   	const char *procname;		/* Text ID for /proc/sys, or zero */
+   	void *data;
diff --color -rcNP Master/include/linux/uidgid.h OG/include/linux/uidgid.h
*** Master/include/linux/uidgid.h	2021-04-20 14:17:31.000000000 -0400
--- OG/include/linux/uidgid.h	2021-04-20 15:11:34.515000000 -0400
***************
*** 188,191 ****
--- 188,197 ----
  
  #endif /* CONFIG_USER_NS */
  
+ #define GR_GLOBAL_UID(x) from_kuid_munged(&init_user_ns, (x))	/* kuid_t x mapped through init_user_ns */
+ #define GR_GLOBAL_GID(x) from_kgid_munged(&init_user_ns, (x))	/* kgid_t x mapped through init_user_ns */
+ #define gr_is_global_root(x) uid_eq((x), GLOBAL_ROOT_UID)	/* true when x equals GLOBAL_ROOT_UID */
+ #define gr_is_global_nonroot(x) (!uid_eq((x), GLOBAL_ROOT_UID))	/* true for any uid other than GLOBAL_ROOT_UID */
+ #define gr_is_global_nonroot_gid(x) (!gid_eq((x), GLOBAL_ROOT_GID))	/* true for any gid other than GLOBAL_ROOT_GID */
+ 
  #endif /* _LINUX_UIDGID_H */
diff --color -rcNP Master/include/linux/xattr.h OG/include/linux/xattr.h
*** Master/include/linux/xattr.h	2021-04-20 14:17:31.000000000 -0400
--- OG/include/linux/xattr.h	2021-04-20 15:11:34.515000000 -0400
***************
*** 46,51 ****
--- 46,54 ----
  	size_t value_len;
  };
  
+ #ifdef CONFIG_MINISEC_XATTR_PAX_FLAGS
+ ssize_t pax_getxattr(struct dentry *, void *, size_t);
+ #endif
  ssize_t __vfs_getxattr(struct dentry *, struct inode *, const char *, void *, size_t);
  ssize_t vfs_getxattr(struct dentry *, const char *, void *, size_t);
  ssize_t vfs_listxattr(struct dentry *d, char *list, size_t size);
diff --color -rcNP Master/include/uapi/linux/elf.h OG/include/uapi/linux/elf.h
*** Master/include/uapi/linux/elf.h	2021-04-20 14:17:31.000000000 -0400
--- OG/include/uapi/linux/elf.h	2021-04-20 15:11:34.515000000 -0400
***************
*** 38,44 ****
--- 38,52 ----
  #define PT_GNU_EH_FRAME		0x6474e550
  
  #define PT_GNU_STACK	(PT_LOOS + 0x474e551)
+ #define PT_GNU_RELRO	(PT_LOOS + 0x474e552)
  
+ #define PT_PAX_FLAGS	(PT_LOOS + 0x5041580)
+ 
+ #define EF_PAX_PAGEEXEC		1	/* Paging based non-executable pages */
+ #define EF_PAX_EMUTRAMP		2	/* Emulate trampolines */
+ #define EF_PAX_MPROTECT		4	/* Restrict mprotect() */
+ #define EF_PAX_RANDMMAP		8	/* Randomize mmap() base */
+ #define EF_PAX_SEGMEXEC		32	/* Segmentation based non-executable pages */
  /*
   * Extended Numbering
   *
***************
*** 95,100 ****
--- 103,110 ----
  #define DT_DEBUG	21
  #define DT_TEXTREL	22
  #define DT_JMPREL	23
+ #define DT_FLAGS	30
+   #define DF_TEXTREL  0x00000004
  #define DT_ENCODING	32
  #define OLD_DT_LOOS	0x60000000
  #define DT_LOOS		0x6000000d
***************
*** 241,246 ****
--- 251,267 ----
  #define PF_W		0x2
  #define PF_X		0x1
  
+ #define PF_PAGEEXEC	(1U << 4)	/* Enable  PAGEEXEC */
+ #define PF_NOPAGEEXEC	(1U << 5)	/* Disable PAGEEXEC */
+ #define PF_SEGMEXEC	(1U << 6)	/* Enable  SEGMEXEC */
+ #define PF_NOSEGMEXEC	(1U << 7)	/* Disable SEGMEXEC */
+ #define PF_MPROTECT	(1U << 8)	/* Enable  MPROTECT */
+ #define PF_NOMPROTECT	(1U << 9)	/* Disable MPROTECT */
+ #define PF_EMUTRAMP	(1U << 12)	/* Enable  EMUTRAMP */
+ #define PF_NOEMUTRAMP	(1U << 13)	/* Disable EMUTRAMP */
+ #define PF_RANDMMAP	(1U << 14)	/* Enable  RANDMMAP */
+ #define PF_NORANDMMAP	(1U << 15)	/* Disable RANDMMAP */
+ 
  typedef struct elf32_phdr{
    Elf32_Word	p_type;
    Elf32_Off	p_offset;
***************
*** 299,305 ****
  #define SHN_ABS		0xfff1
  #define SHN_COMMON	0xfff2
  #define SHN_HIRESERVE	0xffff
!  
  typedef struct elf32_shdr {
    Elf32_Word	sh_name;
    Elf32_Word	sh_type;
--- 320,326 ----
  #define SHN_ABS		0xfff1
  #define SHN_COMMON	0xfff2
  #define SHN_HIRESERVE	0xffff
! 
  typedef struct elf32_shdr {
    Elf32_Word	sh_name;
    Elf32_Word	sh_type;
***************
*** 336,341 ****
--- 357,364 ----
  #define	EI_OSABI	7
  #define	EI_PAD		8
  
+ #define	EI_PAX		14
+ 
  #define	ELFMAG0		0x7f		/* EI_MAG */
  #define	ELFMAG1		'E'
  #define	ELFMAG2		'L'
diff --color -rcNP Master/include/uapi/linux/elf.h.orig OG/include/uapi/linux/elf.h.orig
*** Master/include/uapi/linux/elf.h.orig	1969-12-31 19:00:00.000000000 -0500
--- OG/include/uapi/linux/elf.h.orig	2021-04-20 15:10:45.387000000 -0400
***************
*** 0 ****
--- 1,469 ----
+ /* SPDX-License-Identifier: GPL-2.0 WITH Linux-syscall-note */
+ #ifndef _UAPI_LINUX_ELF_H
+ #define _UAPI_LINUX_ELF_H
+ 
+ #include <linux/types.h>
+ #include <linux/elf-em.h>
+ 
+ /* 32-bit ELF base types. */
+ typedef __u32	Elf32_Addr;
+ typedef __u16	Elf32_Half;
+ typedef __u32	Elf32_Off;
+ typedef __s32	Elf32_Sword;
+ typedef __u32	Elf32_Word;
+ 
+ /* 64-bit ELF base types. */
+ typedef __u64	Elf64_Addr;
+ typedef __u16	Elf64_Half;
+ typedef __s16	Elf64_SHalf;
+ typedef __u64	Elf64_Off;
+ typedef __s32	Elf64_Sword;
+ typedef __u32	Elf64_Word;
+ typedef __u64	Elf64_Xword;
+ typedef __s64	Elf64_Sxword;
+ 
+ /* These constants are for the segment types stored in the image headers */
+ #define PT_NULL    0
+ #define PT_LOAD    1
+ #define PT_DYNAMIC 2
+ #define PT_INTERP  3
+ #define PT_NOTE    4
+ #define PT_SHLIB   5
+ #define PT_PHDR    6
+ #define PT_TLS     7               /* Thread local storage segment */
+ #define PT_LOOS    0x60000000      /* OS-specific */
+ #define PT_HIOS    0x6fffffff      /* OS-specific */
+ #define PT_LOPROC  0x70000000
+ #define PT_HIPROC  0x7fffffff
+ #define PT_GNU_EH_FRAME		0x6474e550
+ 
+ #define PT_GNU_STACK	(PT_LOOS + 0x474e551)
+ #define PT_GNU_RELRO	(PT_LOOS + 0x474e552)
+ 
+ #define PT_PAX_FLAGS	(PT_LOOS + 0x5041580)
+ 
+ #define EF_PAX_PAGEEXEC		1	/* Paging based non-executable pages */
+ #define EF_PAX_EMUTRAMP		2	/* Emulate trampolines */
+ #define EF_PAX_MPROTECT		4	/* Restrict mprotect() */
+ #define EF_PAX_RANDMMAP		8	/* Randomize mmap() base */
+ #define EF_PAX_SEGMEXEC		32	/* Segmentation based non-executable pages */
+ /*
+  * Extended Numbering
+  *
+  * If the real number of program header table entries is larger than
+  * or equal to PN_XNUM(0xffff), it is set to sh_info field of the
+  * section header at index 0, and PN_XNUM is set to e_phnum
+  * field. Otherwise, the section header at index 0 is zero
+  * initialized, if it exists.
+  *
+  * Specifications are available in:
+  *
+  * - Oracle: Linker and Libraries.
+  *   Part No: 817–1984–19, August 2011.
+  *   http://docs.oracle.com/cd/E18752_01/pdf/817-1984.pdf
+  *
+  * - System V ABI AMD64 Architecture Processor Supplement
+  *   Draft Version 0.99.4,
+  *   January 13, 2010.
+  *   http://www.cs.washington.edu/education/courses/cse351/12wi/supp-docs/abi.pdf
+  */
+ #define PN_XNUM 0xffff
+ 
+ /* These constants define the different elf file types */
+ #define ET_NONE   0
+ #define ET_REL    1
+ #define ET_EXEC   2
+ #define ET_DYN    3
+ #define ET_CORE   4
+ #define ET_LOPROC 0xff00
+ #define ET_HIPROC 0xffff
+ 
+ /* This is the info that is needed to parse the dynamic section of the file */
+ #define DT_NULL		0
+ #define DT_NEEDED	1
+ #define DT_PLTRELSZ	2
+ #define DT_PLTGOT	3
+ #define DT_HASH		4
+ #define DT_STRTAB	5
+ #define DT_SYMTAB	6
+ #define DT_RELA		7
+ #define DT_RELASZ	8
+ #define DT_RELAENT	9
+ #define DT_STRSZ	10
+ #define DT_SYMENT	11
+ #define DT_INIT		12
+ #define DT_FINI		13
+ #define DT_SONAME	14
+ #define DT_RPATH 	15
+ #define DT_SYMBOLIC	16
+ #define DT_REL	        17
+ #define DT_RELSZ	18
+ #define DT_RELENT	19
+ #define DT_PLTREL	20
+ #define DT_DEBUG	21
+ #define DT_TEXTREL	22
+ #define DT_JMPREL	23
+ #define DT_FLAGS	30
+   #define DF_TEXTREL  0x00000004
+ #define DT_ENCODING	32
+ #define OLD_DT_LOOS	0x60000000
+ #define DT_LOOS		0x6000000d
+ #define DT_HIOS		0x6ffff000
+ #define DT_VALRNGLO	0x6ffffd00
+ #define DT_VALRNGHI	0x6ffffdff
+ #define DT_ADDRRNGLO	0x6ffffe00
+ #define DT_ADDRRNGHI	0x6ffffeff
+ #define DT_VERSYM	0x6ffffff0
+ #define DT_RELACOUNT	0x6ffffff9
+ #define DT_RELCOUNT	0x6ffffffa
+ #define DT_FLAGS_1	0x6ffffffb
+ #define DT_VERDEF	0x6ffffffc
+ #define	DT_VERDEFNUM	0x6ffffffd
+ #define DT_VERNEED	0x6ffffffe
+ #define	DT_VERNEEDNUM	0x6fffffff
+ #define OLD_DT_HIOS     0x6fffffff
+ #define DT_LOPROC	0x70000000
+ #define DT_HIPROC	0x7fffffff
+ 
+ /* This info is needed when parsing the symbol table */
+ #define STB_LOCAL  0
+ #define STB_GLOBAL 1
+ #define STB_WEAK   2
+ 
+ #define STT_NOTYPE  0
+ #define STT_OBJECT  1
+ #define STT_FUNC    2
+ #define STT_SECTION 3
+ #define STT_FILE    4
+ #define STT_COMMON  5
+ #define STT_TLS     6
+ 
+ #define ELF_ST_BIND(x)		((x) >> 4)
+ #define ELF_ST_TYPE(x)		(((unsigned int) x) & 0xf)
+ #define ELF32_ST_BIND(x)	ELF_ST_BIND(x)
+ #define ELF32_ST_TYPE(x)	ELF_ST_TYPE(x)
+ #define ELF64_ST_BIND(x)	ELF_ST_BIND(x)
+ #define ELF64_ST_TYPE(x)	ELF_ST_TYPE(x)
+ 
+ typedef struct dynamic{
+   Elf32_Sword d_tag;
+   union{
+     Elf32_Sword	d_val;
+     Elf32_Addr	d_ptr;
+   } d_un;
+ } Elf32_Dyn;
+ 
+ typedef struct {
+   Elf64_Sxword d_tag;		/* entry tag value */
+   union {
+     Elf64_Xword d_val;
+     Elf64_Addr d_ptr;
+   } d_un;
+ } Elf64_Dyn;
+ 
+ /* The following are used with relocations */
+ #define ELF32_R_SYM(x) ((x) >> 8)
+ #define ELF32_R_TYPE(x) ((x) & 0xff)
+ 
+ #define ELF64_R_SYM(i)			((i) >> 32)
+ #define ELF64_R_TYPE(i)			((i) & 0xffffffff)
+ 
+ typedef struct elf32_rel {
+   Elf32_Addr	r_offset;
+   Elf32_Word	r_info;
+ } Elf32_Rel;
+ 
+ typedef struct elf64_rel {
+   Elf64_Addr r_offset;	/* Location at which to apply the action */
+   Elf64_Xword r_info;	/* index and type of relocation */
+ } Elf64_Rel;
+ 
+ typedef struct elf32_rela{
+   Elf32_Addr	r_offset;
+   Elf32_Word	r_info;
+   Elf32_Sword	r_addend;
+ } Elf32_Rela;
+ 
+ typedef struct elf64_rela {
+   Elf64_Addr r_offset;	/* Location at which to apply the action */
+   Elf64_Xword r_info;	/* index and type of relocation */
+   Elf64_Sxword r_addend;	/* Constant addend used to compute value */
+ } Elf64_Rela;
+ 
+ typedef struct elf32_sym{
+   Elf32_Word	st_name;
+   Elf32_Addr	st_value;
+   Elf32_Word	st_size;
+   unsigned char	st_info;
+   unsigned char	st_other;
+   Elf32_Half	st_shndx;
+ } Elf32_Sym;
+ 
+ typedef struct elf64_sym {
+   Elf64_Word st_name;		/* Symbol name, index in string tbl */
+   unsigned char	st_info;	/* Type and binding attributes */
+   unsigned char	st_other;	/* No defined meaning, 0 */
+   Elf64_Half st_shndx;		/* Associated section index */
+   Elf64_Addr st_value;		/* Value of the symbol */
+   Elf64_Xword st_size;		/* Associated symbol size */
+ } Elf64_Sym;
+ 
+ 
+ #define EI_NIDENT	16
+ 
+ typedef struct elf32_hdr{
+   unsigned char	e_ident[EI_NIDENT];
+   Elf32_Half	e_type;
+   Elf32_Half	e_machine;
+   Elf32_Word	e_version;
+   Elf32_Addr	e_entry;  /* Entry point */
+   Elf32_Off	e_phoff;
+   Elf32_Off	e_shoff;
+   Elf32_Word	e_flags;
+   Elf32_Half	e_ehsize;
+   Elf32_Half	e_phentsize;
+   Elf32_Half	e_phnum;
+   Elf32_Half	e_shentsize;
+   Elf32_Half	e_shnum;
+   Elf32_Half	e_shstrndx;
+ } Elf32_Ehdr;
+ 
+ typedef struct elf64_hdr {
+   unsigned char	e_ident[EI_NIDENT];	/* ELF "magic number" */
+   Elf64_Half e_type;
+   Elf64_Half e_machine;
+   Elf64_Word e_version;
+   Elf64_Addr e_entry;		/* Entry point virtual address */
+   Elf64_Off e_phoff;		/* Program header table file offset */
+   Elf64_Off e_shoff;		/* Section header table file offset */
+   Elf64_Word e_flags;
+   Elf64_Half e_ehsize;
+   Elf64_Half e_phentsize;
+   Elf64_Half e_phnum;
+   Elf64_Half e_shentsize;
+   Elf64_Half e_shnum;
+   Elf64_Half e_shstrndx;
+ } Elf64_Ehdr;
+ 
+ /* These constants define the permissions on sections in the program
+    header, p_flags. */
+ #define PF_R		0x4
+ #define PF_W		0x2
+ #define PF_X		0x1
+ 
+ #define PF_PAGEEXEC	(1U << 4)	/* Enable  PAGEEXEC */
+ #define PF_NOPAGEEXEC	(1U << 5)	/* Disable PAGEEXEC */
+ #define PF_SEGMEXEC	(1U << 6)	/* Enable  SEGMEXEC */
+ #define PF_NOSEGMEXEC	(1U << 7)	/* Disable SEGMEXEC */
+ #define PF_MPROTECT	(1U << 8)	/* Enable  MPROTECT */
+ #define PF_NOMPROTECT	(1U << 9)	/* Disable MPROTECT */
+ #define PF_EMUTRAMP	(1U << 12)	/* Enable  EMUTRAMP */
+ #define PF_NOEMUTRAMP	(1U << 13)	/* Disable EMUTRAMP */
+ #define PF_RANDMMAP	(1U << 14)	/* Enable  RANDMMAP */
+ #define PF_NORANDMMAP	(1U << 15)	/* Disable RANDMMAP */
+ 
+ typedef struct elf32_phdr{
+   Elf32_Word	p_type;
+   Elf32_Off	p_offset;
+   Elf32_Addr	p_vaddr;
+   Elf32_Addr	p_paddr;
+   Elf32_Word	p_filesz;
+   Elf32_Word	p_memsz;
+   Elf32_Word	p_flags;
+   Elf32_Word	p_align;
+ } Elf32_Phdr;
+ 
+ typedef struct elf64_phdr {
+   Elf64_Word p_type;
+   Elf64_Word p_flags;
+   Elf64_Off p_offset;		/* Segment file offset */
+   Elf64_Addr p_vaddr;		/* Segment virtual address */
+   Elf64_Addr p_paddr;		/* Segment physical address */
+   Elf64_Xword p_filesz;		/* Segment size in file */
+   Elf64_Xword p_memsz;		/* Segment size in memory */
+   Elf64_Xword p_align;		/* Segment alignment, file & memory */
+ } Elf64_Phdr;
+ 
+ /* sh_type */
+ #define SHT_NULL	0
+ #define SHT_PROGBITS	1
+ #define SHT_SYMTAB	2
+ #define SHT_STRTAB	3
+ #define SHT_RELA	4
+ #define SHT_HASH	5
+ #define SHT_DYNAMIC	6
+ #define SHT_NOTE	7
+ #define SHT_NOBITS	8
+ #define SHT_REL		9
+ #define SHT_SHLIB	10
+ #define SHT_DYNSYM	11
+ #define SHT_NUM		12
+ #define SHT_LOPROC	0x70000000
+ #define SHT_HIPROC	0x7fffffff
+ #define SHT_LOUSER	0x80000000
+ #define SHT_HIUSER	0xffffffff
+ 
+ /* sh_flags */
+ #define SHF_WRITE		0x1
+ #define SHF_ALLOC		0x2
+ #define SHF_EXECINSTR		0x4
+ #define SHF_RELA_LIVEPATCH	0x00100000
+ #define SHF_RO_AFTER_INIT	0x00200000
+ #define SHF_MASKPROC		0xf0000000
+ 
+ /* special section indexes */
+ #define SHN_UNDEF	0
+ #define SHN_LORESERVE	0xff00
+ #define SHN_LOPROC	0xff00
+ #define SHN_HIPROC	0xff1f
+ #define SHN_LIVEPATCH	0xff20
+ #define SHN_ABS		0xfff1
+ #define SHN_COMMON	0xfff2
+ #define SHN_HIRESERVE	0xffff
+  
+ typedef struct elf32_shdr {
+   Elf32_Word	sh_name;
+   Elf32_Word	sh_type;
+   Elf32_Word	sh_flags;
+   Elf32_Addr	sh_addr;
+   Elf32_Off	sh_offset;
+   Elf32_Word	sh_size;
+   Elf32_Word	sh_link;
+   Elf32_Word	sh_info;
+   Elf32_Word	sh_addralign;
+   Elf32_Word	sh_entsize;
+ } Elf32_Shdr;
+ 
+ typedef struct elf64_shdr {
+   Elf64_Word sh_name;		/* Section name, index in string tbl */
+   Elf64_Word sh_type;		/* Type of section */
+   Elf64_Xword sh_flags;		/* Miscellaneous section attributes */
+   Elf64_Addr sh_addr;		/* Section virtual addr at execution */
+   Elf64_Off sh_offset;		/* Section file offset */
+   Elf64_Xword sh_size;		/* Size of section in bytes */
+   Elf64_Word sh_link;		/* Index of another section */
+   Elf64_Word sh_info;		/* Additional section information */
+   Elf64_Xword sh_addralign;	/* Section alignment */
+   Elf64_Xword sh_entsize;	/* Entry size if section holds table */
+ } Elf64_Shdr;
+ 
+ #define	EI_MAG0		0		/* e_ident[] indexes */
+ #define	EI_MAG1		1
+ #define	EI_MAG2		2
+ #define	EI_MAG3		3
+ #define	EI_CLASS	4
+ #define	EI_DATA		5
+ #define	EI_VERSION	6
+ #define	EI_OSABI	7
+ #define	EI_PAD		8
+ 
+ #define	EI_PAX		14
+ 
+ #define	ELFMAG0		0x7f		/* EI_MAG */
+ #define	ELFMAG1		'E'
+ #define	ELFMAG2		'L'
+ #define	ELFMAG3		'F'
+ #define	ELFMAG		"\177ELF"
+ #define	SELFMAG		4
+ 
+ #define	ELFCLASSNONE	0		/* EI_CLASS */
+ #define	ELFCLASS32	1
+ #define	ELFCLASS64	2
+ #define	ELFCLASSNUM	3
+ 
+ #define ELFDATANONE	0		/* e_ident[EI_DATA] */
+ #define ELFDATA2LSB	1
+ #define ELFDATA2MSB	2
+ 
+ #define EV_NONE		0		/* e_version, EI_VERSION */
+ #define EV_CURRENT	1
+ #define EV_NUM		2
+ 
+ #define ELFOSABI_NONE	0
+ #define ELFOSABI_LINUX	3
+ 
+ #ifndef ELF_OSABI
+ #define ELF_OSABI ELFOSABI_NONE
+ #endif
+ 
+ /*
+  * Notes used in ET_CORE. Architectures export some of the arch register sets
+  * using the corresponding note types via the PTRACE_GETREGSET and
+  * PTRACE_SETREGSET requests.
+  */
+ #define NT_PRSTATUS	1
+ #define NT_PRFPREG	2
+ #define NT_PRPSINFO	3
+ #define NT_TASKSTRUCT	4
+ #define NT_AUXV		6
+ /*
+  * Note to userspace developers: size of NT_SIGINFO note may increase
+  * in the future to accomodate more fields, don't assume it is fixed!
+  */
+ #define NT_SIGINFO      0x53494749
+ #define NT_FILE         0x46494c45
+ #define NT_PRXFPREG     0x46e62b7f      /* copied from gdb5.1/include/elf/common.h */
+ #define NT_PPC_VMX	0x100		/* PowerPC Altivec/VMX registers */
+ #define NT_PPC_SPE	0x101		/* PowerPC SPE/EVR registers */
+ #define NT_PPC_VSX	0x102		/* PowerPC VSX registers */
+ #define NT_PPC_TAR	0x103		/* Target Address Register */
+ #define NT_PPC_PPR	0x104		/* Program Priority Register */
+ #define NT_PPC_DSCR	0x105		/* Data Stream Control Register */
+ #define NT_PPC_EBB	0x106		/* Event Based Branch Registers */
+ #define NT_PPC_PMU	0x107		/* Performance Monitor Registers */
+ #define NT_PPC_TM_CGPR	0x108		/* TM checkpointed GPR Registers */
+ #define NT_PPC_TM_CFPR	0x109		/* TM checkpointed FPR Registers */
+ #define NT_PPC_TM_CVMX	0x10a		/* TM checkpointed VMX Registers */
+ #define NT_PPC_TM_CVSX	0x10b		/* TM checkpointed VSX Registers */
+ #define NT_PPC_TM_SPR	0x10c		/* TM Special Purpose Registers */
+ #define NT_PPC_TM_CTAR	0x10d		/* TM checkpointed Target Address Register */
+ #define NT_PPC_TM_CPPR	0x10e		/* TM checkpointed Program Priority Register */
+ #define NT_PPC_TM_CDSCR	0x10f		/* TM checkpointed Data Stream Control Register */
+ #define NT_PPC_PKEY	0x110		/* Memory Protection Keys registers */
+ #define NT_386_TLS	0x200		/* i386 TLS slots (struct user_desc) */
+ #define NT_386_IOPERM	0x201		/* x86 io permission bitmap (1=deny) */
+ #define NT_X86_XSTATE	0x202		/* x86 extended state using xsave */
+ #define NT_S390_HIGH_GPRS	0x300	/* s390 upper register halves */
+ #define NT_S390_TIMER	0x301		/* s390 timer register */
+ #define NT_S390_TODCMP	0x302		/* s390 TOD clock comparator register */
+ #define NT_S390_TODPREG	0x303		/* s390 TOD programmable register */
+ #define NT_S390_CTRS	0x304		/* s390 control registers */
+ #define NT_S390_PREFIX	0x305		/* s390 prefix register */
+ #define NT_S390_LAST_BREAK	0x306	/* s390 breaking event address */
+ #define NT_S390_SYSTEM_CALL	0x307	/* s390 system call restart data */
+ #define NT_S390_TDB	0x308		/* s390 transaction diagnostic block */
+ #define NT_S390_VXRS_LOW	0x309	/* s390 vector registers 0-15 upper half */
+ #define NT_S390_VXRS_HIGH	0x30a	/* s390 vector registers 16-31 */
+ #define NT_S390_GS_CB	0x30b		/* s390 guarded storage registers */
+ #define NT_S390_GS_BC	0x30c		/* s390 guarded storage broadcast control block */
+ #define NT_S390_RI_CB	0x30d		/* s390 runtime instrumentation */
+ #define NT_ARM_VFP	0x400		/* ARM VFP/NEON registers */
+ #define NT_ARM_TLS	0x401		/* ARM TLS register */
+ #define NT_ARM_HW_BREAK	0x402		/* ARM hardware breakpoint registers */
+ #define NT_ARM_HW_WATCH	0x403		/* ARM hardware watchpoint registers */
+ #define NT_ARM_SYSTEM_CALL	0x404	/* ARM system call number */
+ #define NT_ARM_SVE	0x405		/* ARM Scalable Vector Extension registers */
+ #define NT_ARM_PAC_MASK		0x406	/* ARM pointer authentication code masks */
+ #define NT_ARM_PACA_KEYS	0x407	/* ARM pointer authentication address keys */
+ #define NT_ARM_PACG_KEYS	0x408	/* ARM pointer authentication generic key */
+ #define NT_ARC_V2	0x600		/* ARCv2 accumulator/extra registers */
+ #define NT_VMCOREDD	0x700		/* Vmcore Device Dump Note */
+ #define NT_MIPS_DSP	0x800		/* MIPS DSP ASE registers */
+ #define NT_MIPS_FP_MODE	0x801		/* MIPS floating-point mode */
+ #define NT_MIPS_MSA	0x802		/* MIPS SIMD registers */
+ 
+ /* Note header in a PT_NOTE section */
+ typedef struct elf32_note {
+   Elf32_Word	n_namesz;	/* Name size */
+   Elf32_Word	n_descsz;	/* Content size */
+   Elf32_Word	n_type;		/* Content type */
+ } Elf32_Nhdr;
+ 
+ /* Note header in a PT_NOTE section */
+ typedef struct elf64_note {
+   Elf64_Word n_namesz;	/* Name size */
+   Elf64_Word n_descsz;	/* Content size */
+   Elf64_Word n_type;	/* Content type */
+ } Elf64_Nhdr;
+ 
+ #endif /* _UAPI_LINUX_ELF_H */
diff --color -rcNP Master/include/uapi/linux/elf.h.rej OG/include/uapi/linux/elf.h.rej
*** Master/include/uapi/linux/elf.h.rej	1969-12-31 19:00:00.000000000 -0500
--- OG/include/uapi/linux/elf.h.rej	2021-04-20 15:11:27.323000000 -0400
***************
*** 0 ****
--- 1,19 ----
+ *** include/uapi/linux/elf.h	2021-03-13 15:44:26.000000000 +0200
+ --- include/uapi/linux/elf.h	2021-03-11 15:06:51.000000000 +0200
+ ***************
+ *** 299,305 ****
+   #define SHN_ABS		0xfff1
+   #define SHN_COMMON	0xfff2
+   #define SHN_HIRESERVE	0xffff
+ ! 
+   typedef struct elf32_shdr {
+     Elf32_Word	sh_name;
+     Elf32_Word	sh_type;
+ --- 278,284 ----
+   #define SHN_ABS		0xfff1
+   #define SHN_COMMON	0xfff2
+   #define SHN_HIRESERVE	0xffff
+ ! 
+   typedef struct elf32_shdr {
+     Elf32_Word	sh_name;
+     Elf32_Word	sh_type;
diff --color -rcNP Master/include/uapi/linux/xattr.h OG/include/uapi/linux/xattr.h
*** Master/include/uapi/linux/xattr.h	2021-04-20 14:17:31.000000000 -0400
--- OG/include/uapi/linux/xattr.h	2021-04-20 15:11:34.515000000 -0400
***************
*** 77,81 ****
--- 77,86 ----
  #define XATTR_POSIX_ACL_DEFAULT  "posix_acl_default"
  #define XATTR_NAME_POSIX_ACL_DEFAULT XATTR_SYSTEM_PREFIX XATTR_POSIX_ACL_DEFAULT
  
+ /* User namespace */
+ #define XATTR_PAX_PREFIX "pax."
+ #define XATTR_PAX_FLAGS_SUFFIX "flags"
+ #define XATTR_NAME_USER_PAX_FLAGS XATTR_USER_PREFIX XATTR_PAX_PREFIX XATTR_PAX_FLAGS_SUFFIX
+ #define XATTR_NAME_PAX_FLAGS XATTR_PAX_PREFIX XATTR_PAX_FLAGS_SUFFIX
  
  #endif /* _UAPI_LINUX_XATTR_H */
diff --color -rcNP Master/init/main.c OG/init/main.c
*** Master/init/main.c	2021-04-20 14:17:31.000000000 -0400
--- OG/init/main.c	2021-04-20 15:11:34.516000000 -0400
***************
*** 109,114 ****
--- 109,116 ----
  extern void init_IRQ(void);
  extern void radix_tree_init(void);
  
+ extern void minisec_init(void);
+ 
  /*
   * Debug helper: via this flag we know that we are in 'early bootup code'
   * where only the boot processor is running with IRQ disabled.  This means
***************
*** 170,175 ****
--- 172,208 ----
  
  __setup("reset_devices", set_reset_devices);
  
+ #ifdef CONFIG_MINISEC_PROC_USERGROUP
+ kgid_t grsec_proc_gid = KGIDT_INIT(CONFIG_MINISEC_PROC_GID);	/* build-time default, replaced by "grsec_proc_gid=" if given */
+ static int __init setup_grsec_proc_gid(char *str)	/* str: text following "grsec_proc_gid=" */
+ {
+ 	grsec_proc_gid = KGIDT_INIT(simple_strtol(str, NULL, 0));	/* base 0 parse; str is not validated here */
+ 	return 1;
+ }
+ __setup("grsec_proc_gid=", setup_grsec_proc_gid);
+ #endif
+ #ifdef CONFIG_MINISEC_SYSFS_RESTRICT
+ int grsec_enable_sysfs_restrict = 1;	/* on by default; "grsec_sysfs_restrict=0" turns it off */
+ static int __init setup_grsec_sysfs_restrict(char *str)	/* str: text following "grsec_sysfs_restrict=" */
+ {
+ 	if (!simple_strtol(str, NULL, 0))	/* only a value that parses to 0 disables the restriction */
+ 		grsec_enable_sysfs_restrict = 0;
+ 	return 1;
+ }
+ __setup("grsec_sysfs_restrict=", setup_grsec_sysfs_restrict);	/* trailing '=' so str is the value, not "=value" */
+ #endif
+ 
+ #ifdef CONFIG_MINISEC_SOFTMODE
+ int pax_softmode;	/* zero-initialised; set from the "pax_softmode=" boot parameter */
+ 
+ static int __init setup_pax_softmode(char *str)	/* str: text following "pax_softmode=" */
+ {
+ 	get_option(&str, &pax_softmode);	/* return value (parse status) is ignored */
+ 	return 1;
+ }
+ __setup("pax_softmode=", setup_pax_softmode);
+ #endif
+ 
  static const char *argv_init[MAX_INIT_ARGS+2] = { "init", NULL, };
  const char *envp_init[MAX_INIT_ENVS+2] = { "HOME=/", "TERM=linux", NULL, };
  static const char *panic_later, *panic_param;
***************
*** 1214,1219 ****
--- 1247,1254 ----
  		prepare_namespace();
  	}
  
+ 	minisec_init();
+ 
  	/*
  	 * Ok, we have completed the initial bootup, and
  	 * we're essentially up and running. Get rid of the
diff --color -rcNP Master/ipc/shm.c OG/ipc/shm.c
*** Master/ipc/shm.c	2021-04-20 14:17:31.000000000 -0400
--- OG/ipc/shm.c	2021-04-20 15:11:34.516000000 -0400
***************
*** 1468,1473 ****
--- 1468,1478 ----
  		f_flags = O_RDWR;
  	}
  	if (shmflg & SHM_EXEC) {
+ #ifdef CONFIG_MINISEC_MPROTECT
+ 		if (current->mm->pax_flags & MF_PAX_MPROTECT)
+ 			goto out;
+ #endif
+ 
  		prot |= PROT_EXEC;
  		acc_mode |= S_IXUGO;
  	}
diff --color -rcNP Master/ipc/util.c OG/ipc/util.c
*** Master/ipc/util.c	2021-04-20 14:17:31.000000000 -0400
--- OG/ipc/util.c	2021-04-20 15:11:34.516000000 -0400
***************
*** 78,83 ****
--- 78,85 ----
  
  extern int ipc_permitted(struct ipc_namespace *ns, struct kern_ipc_perm *ipcp, int requested_mode, int granted_mode);
  
+ extern int gr_ipc_permitted(struct ipc_namespace *ns, struct kern_ipc_perm *ipcp, int requested_mode, int granted_mode);
+ 
  /**
   * ipc_init - initialise ipc subsystem
   *
***************
*** 535,540 ****
--- 537,545 ----
  	if (!ipc_permitted(ns, ipcp, requested_mode, granted_mode))
  		return -1;
  
+ 	if (!gr_ipc_permitted(ns, ipcp, requested_mode, granted_mode))
+ 		return -1;
+ 
  	/* is there some bit set in requested_mode but not in granted_mode? */
  	if ((requested_mode & ~granted_mode & 0007) &&
  	    !ns_capable(ns->user_ns, CAP_IPC_OWNER))
diff --color -rcNP Master/kernel/configs.c OG/kernel/configs.c
*** Master/kernel/configs.c	2021-04-20 14:17:31.000000000 -0400
--- OG/kernel/configs.c	2021-04-20 15:11:34.516000000 -0400
***************
*** 58,65 ****
--- 58,75 ----
  	struct proc_dir_entry *entry;
  
  	/* create the current config file */
+ 	/* Hardened builds limit config.gz to its owner, or owner+group with PROC_USERGROUP alone;
+ 	 * PROC_ADD without a user/group choice also gets owner-only so entry is always assigned. */
+ #if !defined(CONFIG_MINISEC_PROC_ADD) && !defined(CONFIG_MINISEC_HIDESYM)
  	entry = proc_create("config.gz", S_IFREG | S_IRUGO, NULL,
  			    &ikconfig_file_ops);
+ #elif defined(CONFIG_MINISEC_PROC_USERGROUP) && !defined(CONFIG_MINISEC_PROC_USER) && !defined(CONFIG_MINISEC_HIDESYM)
+ 	entry = proc_create("config.gz", S_IFREG | S_IRUSR | S_IRGRP, NULL,
+ 			    &ikconfig_file_ops);
+ #else
+ 	entry = proc_create("config.gz", S_IFREG | S_IRUSR, NULL,
+ 			    &ikconfig_file_ops);
+ #endif
  	if (!entry)
  		return -ENOMEM;
  
diff --color -rcNP Master/kernel/fork.c OG/kernel/fork.c
*** Master/kernel/fork.c	2021-04-20 14:17:31.000000000 -0400
--- OG/kernel/fork.c	2021-04-20 15:11:34.517000000 -0400
***************
*** 914,920 ****
  	set_task_stack_end_magic(tsk);
  
  #ifdef CONFIG_STACKPROTECTOR
! 	tsk->stack_canary = get_random_canary();
  #endif
  	if (orig->cpus_ptr == &orig->cpus_mask)
  		tsk->cpus_ptr = &tsk->cpus_mask;
--- 914,920 ----
  	set_task_stack_end_magic(tsk);
  
  #ifdef CONFIG_STACKPROTECTOR
! 	tsk->stack_canary = pax_get_random_long();
  #endif
  	if (orig->cpus_ptr == &orig->cpus_mask)
  		tsk->cpus_ptr = &tsk->cpus_mask;
diff --color -rcNP Master/kernel/module.c OG/kernel/module.c
*** Master/kernel/module.c	2021-04-20 14:17:31.000000000 -0400
--- OG/kernel/module.c	2021-04-20 15:11:34.517000000 -0400
***************
*** 4545,4551 ****
--- 4545,4561 ----
  
  static int __init proc_modules_init(void)
  {
+ #ifndef CONFIG_MINISEC_HIDESYM	/* without HIDESYM the mode follows the /proc restriction choice */
+ #ifdef CONFIG_MINISEC_PROC_USER
+ 	proc_create("modules", S_IRUSR, NULL, &proc_modules_operations);	/* owner read only */
+ #elif defined(CONFIG_MINISEC_PROC_USERGROUP)
+ 	proc_create("modules", S_IRUSR | S_IRGRP, NULL, &proc_modules_operations);	/* owner and group read */
+ #else
  	proc_create("modules", 0, NULL, &proc_modules_operations);
+ #endif
+ #else	/* CONFIG_MINISEC_HIDESYM: always owner read only */
+ 	proc_create("modules", S_IRUSR, NULL, &proc_modules_operations);	/* same ops table as the unrestricted call */
+ #endif
  	return 0;
  }
  module_init(proc_modules_init);
diff --color -rcNP Master/kernel/module.c.orig OG/kernel/module.c.orig
*** Master/kernel/module.c.orig	1969-12-31 19:00:00.000000000 -0500
--- OG/kernel/module.c.orig	2021-04-20 15:11:27.325000000 -0400
***************
*** 0 ****
--- 1,4695 ----
+ // SPDX-License-Identifier: GPL-2.0-or-later
+ /*
+    Copyright (C) 2002 Richard Henderson
+    Copyright (C) 2001 Rusty Russell, 2002, 2010 Rusty Russell IBM.
+ 
+ */
+ #include <linux/export.h>
+ #include <linux/extable.h>
+ #include <linux/moduleloader.h>
+ #include <linux/module_signature.h>
+ #include <linux/trace_events.h>
+ #include <linux/init.h>
+ #include <linux/kallsyms.h>
+ #include <linux/file.h>
+ #include <linux/fs.h>
+ #include <linux/sysfs.h>
+ #include <linux/kernel.h>
+ #include <linux/slab.h>
+ #include <linux/vmalloc.h>
+ #include <linux/elf.h>
+ #include <linux/proc_fs.h>
+ #include <linux/security.h>
+ #include <linux/seq_file.h>
+ #include <linux/syscalls.h>
+ #include <linux/fcntl.h>
+ #include <linux/rcupdate.h>
+ #include <linux/capability.h>
+ #include <linux/cpu.h>
+ #include <linux/moduleparam.h>
+ #include <linux/errno.h>
+ #include <linux/err.h>
+ #include <linux/vermagic.h>
+ #include <linux/notifier.h>
+ #include <linux/sched.h>
+ #include <linux/device.h>
+ #include <linux/string.h>
+ #include <linux/mutex.h>
+ #include <linux/rculist.h>
+ #include <linux/uaccess.h>
+ #include <asm/cacheflush.h>
+ #include <linux/set_memory.h>
+ #include <asm/mmu_context.h>
+ #include <linux/license.h>
+ #include <asm/sections.h>
+ #include <linux/tracepoint.h>
+ #include <linux/ftrace.h>
+ #include <linux/livepatch.h>
+ #include <linux/async.h>
+ #include <linux/percpu.h>
+ #include <linux/kmemleak.h>
+ #include <linux/jump_label.h>
+ #include <linux/pfn.h>
+ #include <linux/bsearch.h>
+ #include <linux/dynamic_debug.h>
+ #include <linux/audit.h>
+ #include <uapi/linux/module.h>
+ #include "module-internal.h"
+ 
+ #define CREATE_TRACE_POINTS
+ #include <trace/events/module.h>
+ 
+ #ifndef ARCH_SHF_SMALL
+ #define ARCH_SHF_SMALL 0
+ #endif
+ 
+ /*
+  * Modules' sections will be aligned on page boundaries
+  * to ensure complete separation of code and data, but
+  * only when CONFIG_ARCH_HAS_STRICT_MODULE_RWX=y
+  */
+ #ifdef CONFIG_ARCH_HAS_STRICT_MODULE_RWX
+ # define debug_align(X) ALIGN(X, PAGE_SIZE)
+ #else
+ # define debug_align(X) (X)
+ #endif
+ 
+ /* If this is set, the section belongs in the init part of the module */
+ #define INIT_OFFSET_MASK (1UL << (BITS_PER_LONG-1))
+ 
+ /*
+  * Mutex protects:
+  * 1) List of modules (also safely readable with preempt_disable),
+  * 2) module_use links,
+  * 3) module_addr_min/module_addr_max.
+  * (Deletion and addition use RCU list operations.) */
+ DEFINE_MUTEX(module_mutex);
+ EXPORT_SYMBOL_GPL(module_mutex);
+ static LIST_HEAD(modules);
+ 
+ /* Work queue for freeing init sections in success case */
+ static void do_free_init(struct work_struct *w);
+ static DECLARE_WORK(init_free_wq, do_free_init);
+ static LLIST_HEAD(init_free_list);
+ 
+ #ifdef CONFIG_MODULES_TREE_LOOKUP
+ 
+ /*
+  * Use a latched RB-tree for __module_address(); this allows us to use
+  * RCU-sched lookups of the address from any context.
+  *
+  * This is conditional on PERF_EVENTS || TRACING because those can really hit
+  * __module_address() hard by doing a lot of stack unwinding; potentially from
+  * NMI context.
+  */
+ 
+ static __always_inline unsigned long __mod_tree_val(struct latch_tree_node *n)
+ {
+ 	struct module_layout *layout = container_of(n, struct module_layout, mtn.node);
+ 
+ 	return (unsigned long)layout->base;
+ }
+ 
+ static __always_inline unsigned long __mod_tree_size(struct latch_tree_node *n)
+ {
+ 	struct module_layout *layout = container_of(n, struct module_layout, mtn.node);
+ 
+ 	return (unsigned long)layout->size;
+ }
+ 
+ static __always_inline bool
+ mod_tree_less(struct latch_tree_node *a, struct latch_tree_node *b)
+ {
+ 	return __mod_tree_val(a) < __mod_tree_val(b);
+ }
+ 
+ static __always_inline int
+ mod_tree_comp(void *key, struct latch_tree_node *n)
+ {
+ 	/* The key is an address; compare it with [base, base + size). */
+ 	unsigned long addr = (unsigned long)key;
+ 	unsigned long base = __mod_tree_val(n);
+ 
+ 	/* Address lies below this layout. */
+ 	if (addr < base)
+ 		return -1;
+ 
+ 	/* Address lies at or beyond the end of this layout. */
+ 	if (addr >= base + __mod_tree_size(n))
+ 		return 1;
+ 	return 0;
+ }
+ 
+ static const struct latch_tree_ops mod_tree_ops = {
+ 	.less = mod_tree_less,
+ 	.comp = mod_tree_comp,
+ };
+ 
+ static struct mod_tree_root {
+ 	struct latch_tree_root root;
+ 	unsigned long addr_min;
+ 	unsigned long addr_max;
+ } mod_tree __cacheline_aligned = {
+ 	.addr_min = -1UL,
+ };
+ 
+ #define module_addr_min mod_tree.addr_min
+ #define module_addr_max mod_tree.addr_max
+ 
+ static noinline void __mod_tree_insert(struct mod_tree_node *node)
+ {
+ 	latch_tree_insert(&node->node, &mod_tree.root, &mod_tree_ops);
+ }
+ 
+ static void __mod_tree_remove(struct mod_tree_node *node)
+ {
+ 	latch_tree_erase(&node->node, &mod_tree.root, &mod_tree_ops);
+ }
+ 
+ /*
+  * These modifications: insert, remove_init and remove; are serialized by the
+  * module_mutex.
+  */
+ static void mod_tree_insert(struct module *mod)
+ {
+ 	mod->core_layout.mtn.mod = mod;
+ 	mod->init_layout.mtn.mod = mod;
+ 
+ 	__mod_tree_insert(&mod->core_layout.mtn);
+ 	if (mod->init_layout.size)
+ 		__mod_tree_insert(&mod->init_layout.mtn);
+ }
+ 
+ static void mod_tree_remove_init(struct module *mod)
+ {
+ 	if (mod->init_layout.size)
+ 		__mod_tree_remove(&mod->init_layout.mtn);
+ }
+ 
+ static void mod_tree_remove(struct module *mod)
+ {
+ 	__mod_tree_remove(&mod->core_layout.mtn);
+ 	mod_tree_remove_init(mod);
+ }
+ 
+ static struct module *mod_find(unsigned long addr)
+ {
+ 	struct latch_tree_node *ltn;
+ 
+ 	ltn = latch_tree_find((void *)addr, &mod_tree.root, &mod_tree_ops);
+ 	if (!ltn)
+ 		return NULL;
+ 
+ 	return container_of(ltn, struct mod_tree_node, node)->mod;
+ }
+ 
+ #else /* MODULES_TREE_LOOKUP */
+ 
+ static unsigned long module_addr_min = -1UL, module_addr_max = 0;
+ 
+ static void mod_tree_insert(struct module *mod) { }
+ static void mod_tree_remove_init(struct module *mod) { }
+ static void mod_tree_remove(struct module *mod) { }
+ 
+ static struct module *mod_find(unsigned long addr)
+ {
+ 	struct module *mod;
+ 
+ 	list_for_each_entry_rcu(mod, &modules, list,
+ 				lockdep_is_held(&module_mutex)) {
+ 		if (within_module(addr, mod))
+ 			return mod;
+ 	}
+ 
+ 	return NULL;
+ }
+ 
+ #endif /* MODULES_TREE_LOOKUP */
+ 
+ /*
+  * Bounds of module text, for speeding up __module_address.
+  * Protected by module_mutex.
+  */
+ static void __mod_update_bounds(void *base, unsigned int size)
+ {
+ 	unsigned long lo = (unsigned long)base;	/* first byte */
+ 	unsigned long hi = lo + size;		/* one past the last byte */
+ 
+ 	if (module_addr_min > lo)
+ 		module_addr_min = lo;
+ 	if (module_addr_max < hi)
+ 		module_addr_max = hi;
+ }
+ 
+ static void mod_update_bounds(struct module *mod)
+ {
+ 	__mod_update_bounds(mod->core_layout.base, mod->core_layout.size);
+ 	if (mod->init_layout.size)
+ 		__mod_update_bounds(mod->init_layout.base, mod->init_layout.size);
+ }
+ 
+ #ifdef CONFIG_KGDB_KDB
+ struct list_head *kdb_modules = &modules; /* kdb needs the list of modules */
+ #endif /* CONFIG_KGDB_KDB */
+ 
+ static void module_assert_mutex(void)
+ {
+ 	lockdep_assert_held(&module_mutex);
+ }
+ 
+ static void module_assert_mutex_or_preempt(void)
+ {
+ #ifdef CONFIG_LOCKDEP
+ 	if (unlikely(!debug_locks))
+ 		return;
+ 
+ 	WARN_ON_ONCE(!rcu_read_lock_sched_held() &&
+ 		!lockdep_is_held(&module_mutex));
+ #endif
+ }
+ 
+ static bool sig_enforce = IS_ENABLED(CONFIG_MODULE_SIG_FORCE);
+ module_param(sig_enforce, bool_enable_only, 0644);
+ 
+ /*
+  * Export the sig_enforce kernel cmdline parameter so other subsystems can
+  * rely on it instead of checking CONFIG_MODULE_SIG_FORCE directly.
+  */
+ bool is_module_sig_enforced(void)
+ {
+ 	return sig_enforce;
+ }
+ EXPORT_SYMBOL(is_module_sig_enforced);
+ 
+ void set_module_sig_enforced(void)
+ {
+ 	sig_enforce = true;
+ }
+ 
+ /* Block module loading/unloading? */
+ int modules_disabled = 0;
+ core_param(nomodule, modules_disabled, bint, 0);
+ 
+ /* Waiting for a module to finish initializing? */
+ static DECLARE_WAIT_QUEUE_HEAD(module_wq);
+ 
+ static BLOCKING_NOTIFIER_HEAD(module_notify_list);
+ 
+ int register_module_notifier(struct notifier_block *nb)
+ {
+ 	return blocking_notifier_chain_register(&module_notify_list, nb);
+ }
+ EXPORT_SYMBOL(register_module_notifier);
+ 
+ int unregister_module_notifier(struct notifier_block *nb)
+ {
+ 	return blocking_notifier_chain_unregister(&module_notify_list, nb);
+ }
+ EXPORT_SYMBOL(unregister_module_notifier);
+ 
+ /*
+  * We require a truly strong try_module_get(): 0 means success.
+  * Otherwise an error is returned due to ongoing or failed
+  * initialization etc.
+  */
+ static inline int strong_try_module_get(struct module *mod)
+ {
+ 	/* An unformed module must never be handed to us. */
+ 	BUG_ON(mod && mod->state == MODULE_STATE_UNFORMED);
+ 	/* Initialization is still in progress: report it as busy. */
+ 	if (mod && mod->state == MODULE_STATE_COMING)
+ 		return -EBUSY;
+ 
+ 	return try_module_get(mod) ? 0 : -ENOENT;
+ }
+ 
+ static inline void add_taint_module(struct module *mod, unsigned flag,
+ 				    enum lockdep_ok lockdep_ok)
+ {
+ 	add_taint(flag, lockdep_ok);
+ 	set_bit(flag, &mod->taints);
+ }
+ 
+ /*
+  * A thread that wants to hold a reference to a module only while it
+  * is running can call this to safely exit.  nfsd and lockd use this.
+  */
+ void __noreturn __module_put_and_exit(struct module *mod, long code)
+ {
+ 	module_put(mod);
+ 	do_exit(code);
+ }
+ EXPORT_SYMBOL(__module_put_and_exit);
+ 
+ /* Return the index of the SHF_ALLOC section called name, or 0 if none. */
+ static unsigned int find_sec(const struct load_info *info, const char *name)
+ {
+ 	unsigned int idx;
+ 
+ 	for (idx = 1; idx < info->hdr->e_shnum; idx++) {
+ 		const Elf_Shdr *sec = &info->sechdrs[idx];
+ 		if (!(sec->sh_flags & SHF_ALLOC))
+ 			continue;
+ 		if (!strcmp(info->secstrings + sec->sh_name, name))
+ 			return idx;
+ 	}
+ 	return 0;
+ }
+ 
+ /* Find a module section, or NULL. */
+ static void *section_addr(const struct load_info *info, const char *name)
+ {
+ 	/* Section 0 has sh_addr 0. */
+ 	return (void *)info->sechdrs[find_sec(info, name)].sh_addr;
+ }
+ 
+ /* Find a module section, or NULL.  Fill in number of "objects" in section. */
+ static void *section_objs(const struct load_info *info,
+ 			  const char *name,
+ 			  size_t object_size,
+ 			  unsigned int *num)
+ {
+ 	unsigned int sec = find_sec(info, name);
+ 
+ 	/* Section 0 has sh_addr 0 and sh_size 0. */
+ 	*num = info->sechdrs[sec].sh_size / object_size;
+ 	return (void *)info->sechdrs[sec].sh_addr;
+ }
+ 
+ /* Provided by the linker */
+ extern const struct kernel_symbol __start___ksymtab[];
+ extern const struct kernel_symbol __stop___ksymtab[];
+ extern const struct kernel_symbol __start___ksymtab_gpl[];
+ extern const struct kernel_symbol __stop___ksymtab_gpl[];
+ extern const struct kernel_symbol __start___ksymtab_gpl_future[];
+ extern const struct kernel_symbol __stop___ksymtab_gpl_future[];
+ extern const s32 __start___kcrctab[];
+ extern const s32 __start___kcrctab_gpl[];
+ extern const s32 __start___kcrctab_gpl_future[];
+ #ifdef CONFIG_UNUSED_SYMBOLS
+ extern const struct kernel_symbol __start___ksymtab_unused[];
+ extern const struct kernel_symbol __stop___ksymtab_unused[];
+ extern const struct kernel_symbol __start___ksymtab_unused_gpl[];
+ extern const struct kernel_symbol __stop___ksymtab_unused_gpl[];
+ extern const s32 __start___kcrctab_unused[];
+ extern const s32 __start___kcrctab_unused_gpl[];
+ #endif
+ 
+ #ifndef CONFIG_MODVERSIONS
+ #define symversion(base, idx) NULL
+ #else
+ #define symversion(base, idx) ((base != NULL) ? ((base) + (idx)) : NULL)
+ #endif
+ 
+ static bool each_symbol_in_section(const struct symsearch *arr,
+ 				   unsigned int arrsize,
+ 				   struct module *owner,
+ 				   bool (*fn)(const struct symsearch *syms,
+ 					      struct module *owner,
+ 					      void *data),
+ 				   void *data)
+ {
+ 	const struct symsearch *cur;
+ 
+ 	/* Stop at the first table for which the callback reports a hit. */
+ 	for (cur = arr; cur != arr + arrsize; cur++) {
+ 		if (fn(cur, owner, data))
+ 			return true;
+ 	}
+ 	return false;
+ }
+ 
+ /* Returns true as soon as fn returns true, otherwise false. */
+ bool each_symbol_section(bool (*fn)(const struct symsearch *arr,
+ 				    struct module *owner,
+ 				    void *data),
+ 			 void *data)
+ {
+ 	struct module *mod;
+ 	static const struct symsearch arr[] = {
+ 		{ __start___ksymtab, __stop___ksymtab, __start___kcrctab,
+ 		  NOT_GPL_ONLY, false },
+ 		{ __start___ksymtab_gpl, __stop___ksymtab_gpl,
+ 		  __start___kcrctab_gpl,
+ 		  GPL_ONLY, false },
+ 		{ __start___ksymtab_gpl_future, __stop___ksymtab_gpl_future,
+ 		  __start___kcrctab_gpl_future,
+ 		  WILL_BE_GPL_ONLY, false },
+ #ifdef CONFIG_UNUSED_SYMBOLS
+ 		{ __start___ksymtab_unused, __stop___ksymtab_unused,
+ 		  __start___kcrctab_unused,
+ 		  NOT_GPL_ONLY, true },
+ 		{ __start___ksymtab_unused_gpl, __stop___ksymtab_unused_gpl,
+ 		  __start___kcrctab_unused_gpl,
+ 		  GPL_ONLY, true },
+ #endif
+ 	};
+ 
+ 	module_assert_mutex_or_preempt();
+ 
+ 	if (each_symbol_in_section(arr, ARRAY_SIZE(arr), NULL, fn, data))
+ 		return true;
+ 
+ 	list_for_each_entry_rcu(mod, &modules, list,
+ 				lockdep_is_held(&module_mutex)) {
+ 		struct symsearch arr[] = {
+ 			{ mod->syms, mod->syms + mod->num_syms, mod->crcs,
+ 			  NOT_GPL_ONLY, false },
+ 			{ mod->gpl_syms, mod->gpl_syms + mod->num_gpl_syms,
+ 			  mod->gpl_crcs,
+ 			  GPL_ONLY, false },
+ 			{ mod->gpl_future_syms,
+ 			  mod->gpl_future_syms + mod->num_gpl_future_syms,
+ 			  mod->gpl_future_crcs,
+ 			  WILL_BE_GPL_ONLY, false },
+ #ifdef CONFIG_UNUSED_SYMBOLS
+ 			{ mod->unused_syms,
+ 			  mod->unused_syms + mod->num_unused_syms,
+ 			  mod->unused_crcs,
+ 			  NOT_GPL_ONLY, true },
+ 			{ mod->unused_gpl_syms,
+ 			  mod->unused_gpl_syms + mod->num_unused_gpl_syms,
+ 			  mod->unused_gpl_crcs,
+ 			  GPL_ONLY, true },
+ #endif
+ 		};
+ 
+ 		if (mod->state == MODULE_STATE_UNFORMED)
+ 			continue;
+ 
+ 		if (each_symbol_in_section(arr, ARRAY_SIZE(arr), mod, fn, data))
+ 			return true;
+ 	}
+ 	return false;
+ }
+ EXPORT_SYMBOL_GPL(each_symbol_section);
+ 
+ struct find_symbol_arg {
+ 	/* Input */
+ 	const char *name;
+ 	bool gplok;
+ 	bool warn;
+ 
+ 	/* Output */
+ 	struct module *owner;
+ 	const s32 *crc;
+ 	const struct kernel_symbol *sym;
+ };
+ 
+ static bool check_exported_symbol(const struct symsearch *syms,
+ 				  struct module *owner,
+ 				  unsigned int symnum, void *data)
+ {
+ 	struct find_symbol_arg *fsa = data;
+ 
+ 	if (!fsa->gplok) {
+ 		if (syms->licence == GPL_ONLY)
+ 			return false;
+ 		if (syms->licence == WILL_BE_GPL_ONLY && fsa->warn) {
+ 			pr_warn("Symbol %s is being used by a non-GPL module, "
+ 				"which will not be allowed in the future\n",
+ 				fsa->name);
+ 		}
+ 	}
+ 
+ #ifdef CONFIG_UNUSED_SYMBOLS
+ 	if (syms->unused && fsa->warn) {
+ 		pr_warn("Symbol %s is marked as UNUSED, however this module is "
+ 			"using it.\n", fsa->name);
+ 		pr_warn("This symbol will go away in the future.\n");
+ 		pr_warn("Please evaluate if this is the right api to use and "
+ 			"if it really is, submit a report to the linux kernel "
+ 			"mailing list together with submitting your code for "
+ 			"inclusion.\n");
+ 	}
+ #endif
+ 
+ 	fsa->owner = owner;
+ 	fsa->crc = symversion(syms->crcs, symnum);
+ 	fsa->sym = &syms->start[symnum];
+ 	return true;
+ }
+ 
+ static unsigned long kernel_symbol_value(const struct kernel_symbol *sym)
+ {
+ #ifdef CONFIG_HAVE_ARCH_PREL32_RELOCATIONS
+ 	return (unsigned long)offset_to_ptr(&sym->value_offset);
+ #else
+ 	return sym->value;
+ #endif
+ }
+ 
+ static const char *kernel_symbol_name(const struct kernel_symbol *sym)
+ {
+ #ifdef CONFIG_HAVE_ARCH_PREL32_RELOCATIONS
+ 	return offset_to_ptr(&sym->name_offset);
+ #else
+ 	return sym->name;
+ #endif
+ }
+ 
+ static const char *kernel_symbol_namespace(const struct kernel_symbol *sym)
+ {
+ #ifdef CONFIG_HAVE_ARCH_PREL32_RELOCATIONS
+ 	if (!sym->namespace_offset)
+ 		return NULL;
+ 	return offset_to_ptr(&sym->namespace_offset);
+ #else
+ 	return sym->namespace;
+ #endif
+ }
+ 
+ static int cmp_name(const void *name, const void *sym)
+ {
+ 	return strcmp(name, kernel_symbol_name(sym));
+ }
+ 
+ static bool find_exported_symbol_in_section(const struct symsearch *syms,
+ 					    struct module *owner,
+ 					    void *data)
+ {
+ 	struct find_symbol_arg *fsa = data;
+ 	struct kernel_symbol *hit;
+ 
+ 	/* Binary search by name; the table is expected to be name-sorted. */
+ 	hit = bsearch(fsa->name, syms->start, syms->stop - syms->start,
+ 		      sizeof(struct kernel_symbol), cmp_name);
+ 	if (hit == NULL)
+ 		return false;
+ 
+ 	/* Found by name: let the licence/usage checks accept or reject it. */
+ 	return check_exported_symbol(syms, owner, hit - syms->start, data);
+ }
+ 
+ /* Find an exported symbol by name and return it; optionally also report its
+  * crc and owning module.  Needs preempt disabled or module_mutex held. */
+ const struct kernel_symbol *find_symbol(const char *name,
+ 					struct module **owner,
+ 					const s32 **crc,
+ 					bool gplok,
+ 					bool warn)
+ {
+ 	struct find_symbol_arg fsa;
+ 
+ 	fsa.name = name;
+ 	fsa.gplok = gplok;
+ 	fsa.warn = warn;
+ 
+ 	if (each_symbol_section(find_exported_symbol_in_section, &fsa)) {
+ 		if (owner)
+ 			*owner = fsa.owner;
+ 		if (crc)
+ 			*crc = fsa.crc;
+ 		return fsa.sym;
+ 	}
+ 
+ 	pr_debug("Failed to find symbol %s\n", name);
+ 	return NULL;
+ }
+ EXPORT_SYMBOL_GPL(find_symbol);
+ 
+ /*
+  * Search for module by name: must hold module_mutex (or preempt disabled
+  * for read-only access).
+  */
+ static struct module *find_module_all(const char *name, size_t len,
+ 				      bool even_unformed)
+ {
+ 	struct module *mod;
+ 
+ 	module_assert_mutex_or_preempt();
+ 
+ 	list_for_each_entry_rcu(mod, &modules, list,
+ 				lockdep_is_held(&module_mutex)) {
+ 		if (!even_unformed && mod->state == MODULE_STATE_UNFORMED)
+ 			continue;
+ 		if (strlen(mod->name) == len && !memcmp(mod->name, name, len))
+ 			return mod;
+ 	}
+ 	return NULL;
+ }
+ 
+ struct module *find_module(const char *name)
+ {
+ 	module_assert_mutex();
+ 	return find_module_all(name, strlen(name), false);
+ }
+ EXPORT_SYMBOL_GPL(find_module);
+ 
+ #ifdef CONFIG_SMP
+ 
+ static inline void __percpu *mod_percpu(struct module *mod)
+ {
+ 	return mod->percpu;
+ }
+ 
+ static int percpu_modalloc(struct module *mod, struct load_info *info)
+ {
+ 	Elf_Shdr *pcpusec = &info->sechdrs[info->index.pcpu];
+ 	unsigned long align = pcpusec->sh_addralign;
+ 
+ 	if (!pcpusec->sh_size)
+ 		return 0;
+ 
+ 	if (align > PAGE_SIZE) {
+ 		pr_warn("%s: per-cpu alignment %li > %li\n",
+ 			mod->name, align, PAGE_SIZE);
+ 		align = PAGE_SIZE;
+ 	}
+ 
+ 	mod->percpu = __alloc_reserved_percpu(pcpusec->sh_size, align);
+ 	if (!mod->percpu) {
+ 		pr_warn("%s: Could not allocate %lu bytes percpu data\n",
+ 			mod->name, (unsigned long)pcpusec->sh_size);
+ 		return -ENOMEM;
+ 	}
+ 	mod->percpu_size = pcpusec->sh_size;
+ 	return 0;
+ }
+ 
+ static void percpu_modfree(struct module *mod)
+ {
+ 	free_percpu(mod->percpu);
+ }
+ 
+ static unsigned int find_pcpusec(struct load_info *info)
+ {
+ 	return find_sec(info, ".data..percpu");
+ }
+ 
+ static void percpu_modcopy(struct module *mod,
+ 			   const void *from, unsigned long size)
+ {
+ 	int cpu;
+ 
+ 	for_each_possible_cpu(cpu)
+ 		memcpy(per_cpu_ptr(mod->percpu, cpu), from, size);
+ }
+ 
+ bool __is_module_percpu_address(unsigned long addr, unsigned long *can_addr)
+ {
+ 	struct module *mod;
+ 	unsigned int cpu;
+ 
+ 	preempt_disable();
+ 
+ 	list_for_each_entry_rcu(mod, &modules, list) {
+ 		if (mod->state == MODULE_STATE_UNFORMED)
+ 			continue;
+ 		if (!mod->percpu_size)
+ 			continue;
+ 		for_each_possible_cpu(cpu) {
+ 			void *start = per_cpu_ptr(mod->percpu, cpu);
+ 			void *va = (void *)addr;
+ 
+ 			if (va >= start && va < start + mod->percpu_size) {
+ 				if (can_addr) {
+ 					*can_addr = (unsigned long) (va - start);
+ 					*can_addr += (unsigned long)
+ 						per_cpu_ptr(mod->percpu,
+ 							    get_boot_cpu_id());
+ 				}
+ 				preempt_enable();
+ 				return true;
+ 			}
+ 		}
+ 	}
+ 
+ 	preempt_enable();
+ 	return false;
+ }
+ 
+ /**
+  * is_module_percpu_address - test whether address is from module static percpu
+  * @addr: address to test
+  *
+  * Test whether @addr belongs to module static percpu area.
+  *
+  * RETURNS:
+  * %true if @addr is from module static percpu area
+  */
+ bool is_module_percpu_address(unsigned long addr)
+ {
+ 	return __is_module_percpu_address(addr, NULL);
+ }
+ 
+ #else /* ... !CONFIG_SMP */
+ 
+ static inline void __percpu *mod_percpu(struct module *mod)
+ {
+ 	return NULL;
+ }
+ static int percpu_modalloc(struct module *mod, struct load_info *info)
+ {
+ 	/* UP modules shouldn't have this section: ENOMEM isn't quite right */
+ 	if (info->sechdrs[info->index.pcpu].sh_size != 0)
+ 		return -ENOMEM;
+ 	return 0;
+ }
+ static inline void percpu_modfree(struct module *mod)
+ {
+ }
+ static unsigned int find_pcpusec(struct load_info *info)
+ {
+ 	return 0;
+ }
+ static inline void percpu_modcopy(struct module *mod,
+ 				  const void *from, unsigned long size)
+ {
+ 	/* pcpusec should be 0, and size of that section should be 0. */
+ 	BUG_ON(size != 0);
+ }
+ bool is_module_percpu_address(unsigned long addr)
+ {
+ 	return false;
+ }
+ 
+ bool __is_module_percpu_address(unsigned long addr, unsigned long *can_addr)
+ {
+ 	return false;
+ }
+ 
+ #endif /* CONFIG_SMP */
+ 
+ #define MODINFO_ATTR(field)	\
+ static void setup_modinfo_##field(struct module *mod, const char *s)  \
+ {                                                                     \
+ 	mod->field = kstrdup(s, GFP_KERNEL);                          \
+ }                                                                     \
+ static ssize_t show_modinfo_##field(struct module_attribute *mattr,   \
+ 			struct module_kobject *mk, char *buffer)      \
+ {                                                                     \
+ 	return scnprintf(buffer, PAGE_SIZE, "%s\n", mk->mod->field);  \
+ }                                                                     \
+ static int modinfo_##field##_exists(struct module *mod)               \
+ {                                                                     \
+ 	return mod->field != NULL;                                    \
+ }                                                                     \
+ static void free_modinfo_##field(struct module *mod)                  \
+ {                                                                     \
+ 	kfree(mod->field);                                            \
+ 	mod->field = NULL;                                            \
+ }                                                                     \
+ static struct module_attribute modinfo_##field = {                    \
+ 	.attr = { .name = __stringify(field), .mode = 0444 },         \
+ 	.show = show_modinfo_##field,                                 \
+ 	.setup = setup_modinfo_##field,                               \
+ 	.test = modinfo_##field##_exists,                             \
+ 	.free = free_modinfo_##field,                                 \
+ };
+ 
+ MODINFO_ATTR(version);
+ MODINFO_ATTR(srcversion);
+ 
+ static char last_unloaded_module[MODULE_NAME_LEN+1];
+ 
+ #ifdef CONFIG_MODULE_UNLOAD
+ 
+ EXPORT_TRACEPOINT_SYMBOL(module_get);
+ 
+ /* MODULE_REF_BASE is the base reference count held by the module loader. */
+ #define MODULE_REF_BASE	1
+ 
+ /* Init the unload section of the module. */
+ static int module_unload_init(struct module *mod)
+ {
+ 	/*
+ 	 * Initialize reference counter to MODULE_REF_BASE.
+ 	 * refcnt == 0 means module is going.
+ 	 */
+ 	atomic_set(&mod->refcnt, MODULE_REF_BASE);
+ 
+ 	INIT_LIST_HEAD(&mod->source_list);
+ 	INIT_LIST_HEAD(&mod->target_list);
+ 
+ 	/* Hold reference count during initialization. */
+ 	atomic_inc(&mod->refcnt);
+ 
+ 	return 0;
+ }
+ 
+ /* Return 1 if a is already on b's source_list (i.e. a uses b), else 0. */
+ static int already_uses(struct module *a, struct module *b)
+ {
+ 	struct module_use *use;
+ 
+ 	list_for_each_entry(use, &b->source_list, source_list) {
+ 		if (use->source == a) {
+ 			pr_debug("%s uses %s!\n", a->name, b->name);
+ 			return 1;
+ 		}
+ 	}
+ 	pr_debug("%s does not use %s!\n", a->name, b->name);
+ 	return 0;
+ }
+ 
+ /*
+  * Module a uses b
+  *  - we add 'a' as a "source", 'b' as a "target" of module use
+  *  - the module_use is added to the list of 'b' sources (so
+  *    'b' can walk the list to see who sourced them), and of 'a'
+  *    targets (so 'a' can see what modules it targets).
+  */
+ static int add_module_usage(struct module *a, struct module *b)
+ {
+ 	struct module_use *use;
+ 
+ 	pr_debug("Allocating new usage for %s.\n", a->name);
+ 	use = kmalloc(sizeof(*use), GFP_ATOMIC);
+ 	if (!use)
+ 		return -ENOMEM;
+ 
+ 	use->source = a;
+ 	use->target = b;
+ 	list_add(&use->source_list, &b->source_list);
+ 	list_add(&use->target_list, &a->target_list);
+ 	return 0;
+ }
+ 
+ /* Record that a uses b; 0 or -errno.  Caller must hold module_mutex. */
+ int ref_module(struct module *a, struct module *b)
+ {
+ 	int err;
+ 
+ 	if (b == NULL || already_uses(a, b))	/* nothing new to record */
+ 		return 0;
+ 
+ 	/* If module isn't available, we fail. */
+ 	err = strong_try_module_get(b);
+ 	if (err)
+ 		return err;
+ 
+ 	err = add_module_usage(a, b);
+ 	if (err) {
+ 		module_put(b);	/* undo the reference taken just above */
+ 		return err;
+ 	}
+ 	return 0;
+ }
+ EXPORT_SYMBOL_GPL(ref_module);
+ 
+ /* Clear the unload stuff of the module. */
+ static void module_unload_free(struct module *mod)
+ {
+ 	struct module_use *use, *tmp;
+ 
+ 	mutex_lock(&module_mutex);
+ 	list_for_each_entry_safe(use, tmp, &mod->target_list, target_list) {
+ 		struct module *i = use->target;
+ 		pr_debug("%s unusing %s\n", mod->name, i->name);
+ 		module_put(i);
+ 		list_del(&use->source_list);
+ 		list_del(&use->target_list);
+ 		kfree(use);
+ 	}
+ 	mutex_unlock(&module_mutex);
+ }
+ 
+ #ifdef CONFIG_MODULE_FORCE_UNLOAD
+ static inline int try_force_unload(unsigned int flags)
+ {
+ 	int ret = (flags & O_TRUNC);
+ 	if (ret)
+ 		add_taint(TAINT_FORCED_RMMOD, LOCKDEP_NOW_UNRELIABLE);
+ 	return ret;
+ }
+ #else
+ static inline int try_force_unload(unsigned int flags)
+ {
+ 	return 0;
+ }
+ #endif /* CONFIG_MODULE_FORCE_UNLOAD */
+ 
+ /* Try to drop the base reference of the module; 0 means success. */
+ static int try_release_module_ref(struct module *mod)
+ {
+ 	int ret;
+ 
+ 	/* Drop the MODULE_REF_BASE reference that was set at load time. */
+ 	ret = atomic_sub_return(MODULE_REF_BASE, &mod->refcnt);
+ 	BUG_ON(ret < 0);
+ 	if (ret)
+ 		/* Still referenced: restore the base, unless a put hit 0. */
+ 		ret = atomic_add_unless(&mod->refcnt, MODULE_REF_BASE, 0);
+ 
+ 	return ret;
+ }
+ 
+ static int try_stop_module(struct module *mod, int flags, int *forced)
+ {
+ 	/* If it's not unused, quit unless we're forcing. */
+ 	if (try_release_module_ref(mod) != 0) {
+ 		*forced = try_force_unload(flags);
+ 		if (!(*forced))
+ 			return -EWOULDBLOCK;
+ 	}
+ 
+ 	/* Mark it as dying. */
+ 	mod->state = MODULE_STATE_GOING;
+ 
+ 	return 0;
+ }
+ 
+ /**
+  * module_refcount - return the refcount or -1 if unloading
+  *
+  * @mod:	the module we're checking
+  *
+  * Returns:
+  *	-1 if the module is in the process of unloading
+  *	otherwise the number of references in the kernel to the module
+  */
+ int module_refcount(struct module *mod)
+ {
+ 	return atomic_read(&mod->refcnt) - MODULE_REF_BASE;
+ }
+ EXPORT_SYMBOL(module_refcount);
+ 
+ /* This exists whether we can unload or not */
+ static void free_module(struct module *mod);
+ 
+ SYSCALL_DEFINE2(delete_module, const char __user *, name_user,
+ 		unsigned int, flags)
+ {
+ 	struct module *mod;
+ 	char name[MODULE_NAME_LEN];
+ 	int ret, forced = 0;
+ 
+ 	if (!capable(CAP_SYS_MODULE) || modules_disabled)
+ 		return -EPERM;
+ 
+ 	if (strncpy_from_user(name, name_user, MODULE_NAME_LEN-1) < 0)
+ 		return -EFAULT;
+ 	name[MODULE_NAME_LEN-1] = '\0';
+ 
+ 	audit_log_kern_module(name);
+ 
+ 	if (mutex_lock_interruptible(&module_mutex) != 0)
+ 		return -EINTR;
+ 
+ 	mod = find_module(name);
+ 	if (!mod) {
+ 		ret = -ENOENT;
+ 		goto out;
+ 	}
+ 
+ 	if (!list_empty(&mod->source_list)) {
+ 		/* Other modules depend on us: get rid of them first. */
+ 		ret = -EWOULDBLOCK;
+ 		goto out;
+ 	}
+ 
+ 	/* Doing init or already dying? */
+ 	if (mod->state != MODULE_STATE_LIVE) {
+ 		/* FIXME: if (force), slam module count damn the torpedoes */
+ 		pr_debug("%s already dying\n", mod->name);
+ 		ret = -EBUSY;
+ 		goto out;
+ 	}
+ 
+ 	/* If it has an init func, it must have an exit func to unload */
+ 	if (mod->init && !mod->exit) {
+ 		forced = try_force_unload(flags);
+ 		if (!forced) {
+ 			/* This module can't be removed */
+ 			ret = -EBUSY;
+ 			goto out;
+ 		}
+ 	}
+ 
+ 	/* Drop the base reference and mark the module as going, or bail. */
+ 	ret = try_stop_module(mod, flags, &forced);
+ 	if (ret != 0)
+ 		goto out;
+ 
+ 	mutex_unlock(&module_mutex);
+ 	/* Final destruction now no one is using it. */
+ 	if (mod->exit != NULL)
+ 		mod->exit();
+ 	blocking_notifier_call_chain(&module_notify_list,
+ 				     MODULE_STATE_GOING, mod);
+ 	klp_module_going(mod);
+ 	ftrace_release_mod(mod);
+ 
+ 	async_synchronize_full();
+ 
+ 	/* Store the name of the last unloaded module for diagnostic purposes */
+ 	strlcpy(last_unloaded_module, mod->name, sizeof(last_unloaded_module));
+ 
+ 	free_module(mod);
+ 	/* someone could wait for the module in add_unformed_module() */
+ 	wake_up_all(&module_wq);
+ 	return 0;
+ out:
+ 	mutex_unlock(&module_mutex);
+ 	return ret;
+ }
+ 
+ static inline void print_unload_info(struct seq_file *m, struct module *mod)
+ {
+ 	struct module_use *use;
+ 	int printed_something = 0;
+ 
+ 	seq_printf(m, " %i ", module_refcount(mod));
+ 
+ 	/*
+ 	 * Always include a trailing , so userspace can differentiate
+ 	 * between this and the old multi-field proc format.
+ 	 */
+ 	list_for_each_entry(use, &mod->source_list, source_list) {
+ 		printed_something = 1;
+ 		seq_printf(m, "%s,", use->source->name);
+ 	}
+ 
+ 	if (mod->init != NULL && mod->exit == NULL) {
+ 		printed_something = 1;
+ 		seq_puts(m, "[permanent],");
+ 	}
+ 
+ 	if (!printed_something)
+ 		seq_puts(m, "-");
+ }
+ 
+ void __symbol_put(const char *symbol)
+ {
+ 	struct module *owner;
+ 
+ 	preempt_disable();
+ 	if (!find_symbol(symbol, &owner, NULL, true, false))
+ 		BUG();
+ 	module_put(owner);
+ 	preempt_enable();
+ }
+ EXPORT_SYMBOL(__symbol_put);
+ 
+ /* Note this assumes addr is a function, which it currently always is. */
+ void symbol_put_addr(void *addr)
+ {
+ 	struct module *modaddr;
+ 	unsigned long a = (unsigned long)dereference_function_descriptor(addr);
+ 
+ 	if (core_kernel_text(a))
+ 		return;
+ 
+ 	/*
+ 	 * Even though we hold a reference on the module; we still need to
+ 	 * disable preemption in order to safely traverse the data structure.
+ 	 */
+ 	preempt_disable();
+ 	modaddr = __module_text_address(a);
+ 	BUG_ON(!modaddr);
+ 	module_put(modaddr);
+ 	preempt_enable();
+ }
+ EXPORT_SYMBOL_GPL(symbol_put_addr);
+ 
+ static ssize_t show_refcnt(struct module_attribute *mattr,
+ 			   struct module_kobject *mk, char *buffer)
+ {
+ 	return sprintf(buffer, "%i\n", module_refcount(mk->mod));
+ }
+ 
+ static struct module_attribute modinfo_refcnt =
+ 	__ATTR(refcnt, 0444, show_refcnt, NULL);
+ 
+ void __module_get(struct module *module)
+ {
+ 	if (!module)
+ 		return;	/* NULL module: nothing to take a reference on */
+ 	preempt_disable();
+ 	atomic_inc(&module->refcnt);
+ 	trace_module_get(module, _RET_IP_);
+ 	preempt_enable();
+ }
+ EXPORT_SYMBOL(__module_get);
+ 
+ bool try_module_get(struct module *module)
+ {
+ 	bool got;
+ 
+ 	/* A NULL module always counts as success. */
+ 	if (!module)
+ 		return true;
+ 
+ 	preempt_disable();
+ 	/* Note: here, we can fail to get a reference */
+ 	got = module_is_live(module) &&
+ 	      atomic_inc_not_zero(&module->refcnt) != 0;
+ 	if (likely(got))
+ 		trace_module_get(module, _RET_IP_);
+ 	preempt_enable();
+ 	return got;
+ }
+ EXPORT_SYMBOL(try_module_get);
+ 
+ void module_put(struct module *module)
+ {
+ 	int left;
+ 
+ 	if (!module)
+ 		return;
+ 	preempt_disable();
+ 	left = atomic_dec_if_positive(&module->refcnt);
+ 	WARN_ON(left < 0);	/* Failed to put refcount */
+ 	trace_module_put(module, _RET_IP_);
+ 	preempt_enable();
+ }
+ EXPORT_SYMBOL(module_put);
+ 
+ #else /* !CONFIG_MODULE_UNLOAD */
+ static inline void print_unload_info(struct seq_file *m, struct module *mod)
+ {
+ 	/* We don't know the usage count, or what modules are using. */
+ 	seq_puts(m, " - -");
+ }
+ 
+ static inline void module_unload_free(struct module *mod)
+ {
+ }
+ 
+ int ref_module(struct module *a, struct module *b)
+ {
+ 	return strong_try_module_get(b);
+ }
+ EXPORT_SYMBOL_GPL(ref_module);
+ 
+ static inline int module_unload_init(struct module *mod)
+ {
+ 	return 0;
+ }
+ #endif /* CONFIG_MODULE_UNLOAD */
+ 
+ static size_t module_flags_taint(struct module *mod, char *buf)
+ {
+ 	char *out = buf;
+ 	int bit;
+ 
+ 	/* Emit one character per module-relevant taint bit that is set. */
+ 	for (bit = 0; bit < TAINT_FLAGS_COUNT; bit++) {
+ 		if (taint_flags[bit].module && test_bit(bit, &mod->taints))
+ 			*out++ = taint_flags[bit].c_true;
+ 	}
+ 	return out - buf;
+ }
+ 
+ static ssize_t show_initstate(struct module_attribute *mattr,
+ 			      struct module_kobject *mk, char *buffer)
+ {
+ 	const char *state = "unknown";
+ 
+ 	switch (mk->mod->state) {
+ 	case MODULE_STATE_LIVE:
+ 		state = "live";
+ 		break;
+ 	case MODULE_STATE_COMING:
+ 		state = "coming";
+ 		break;
+ 	case MODULE_STATE_GOING:
+ 		state = "going";
+ 		break;
+ 	default:
+ 		BUG();
+ 	}
+ 	return sprintf(buffer, "%s\n", state);
+ }
+ 
+ static struct module_attribute modinfo_initstate =
+ 	__ATTR(initstate, 0444, show_initstate, NULL);
+ 
+ static ssize_t store_uevent(struct module_attribute *mattr,
+ 			    struct module_kobject *mk,
+ 			    const char *buffer, size_t count)
+ {
+ 	int err = kobject_synth_uevent(&mk->kobj, buffer, count);
+ 
+ 	/* On success, report the whole write as consumed. */
+ 	return err ? err : count;
+ }
+ 
+ struct module_attribute module_uevent =
+ 	__ATTR(uevent, 0200, NULL, store_uevent);
+ 
+ static ssize_t show_coresize(struct module_attribute *mattr,
+ 			     struct module_kobject *mk, char *buffer)
+ {
+ 	return sprintf(buffer, "%u\n", mk->mod->core_layout.size);
+ }
+ 
+ static struct module_attribute modinfo_coresize =
+ 	__ATTR(coresize, 0444, show_coresize, NULL);
+ 
+ static ssize_t show_initsize(struct module_attribute *mattr,
+ 			     struct module_kobject *mk, char *buffer)
+ {
+ 	return sprintf(buffer, "%u\n", mk->mod->init_layout.size);	/* decimal init_layout.size + newline */
+ }
+ 
+ static struct module_attribute modinfo_initsize =
+ 	__ATTR(initsize, 0444, show_initsize, NULL);	/* mode 0444, show hook only */
+ 
+ static ssize_t show_taint(struct module_attribute *mattr,
+ 			  struct module_kobject *mk, char *buffer)
+ {
+ 	/* Taint flag characters first, then a single newline. */
+ 	size_t used = module_flags_taint(mk->mod, buffer);
+ 
+ 	buffer[used] = '\n';
+ 	return used + 1;
+ }
+ 
+ static struct module_attribute modinfo_taint =
+ 	__ATTR(taint, 0444, show_taint, NULL);
+ 
+ static struct module_attribute *modinfo_attrs[] = {	/* copied per module by module_add_modinfo_attrs() */
+ 	&module_uevent,
+ 	&modinfo_version,
+ 	&modinfo_srcversion,
+ 	&modinfo_initstate,
+ 	&modinfo_coresize,
+ 	&modinfo_initsize,
+ 	&modinfo_taint,
+ #ifdef CONFIG_MODULE_UNLOAD
+ 	&modinfo_refcnt,
+ #endif
+ 	NULL,	/* sentinel: the copy loop stops at the first NULL entry */
+ };
+ 
+ static const char vermagic[] = VERMAGIC_STRING;
+ 
+ static int try_to_force_load(struct module *mod, const char *reason)
+ {
+ #ifdef CONFIG_MODULE_FORCE_LOAD
+ 	if (!test_taint(TAINT_FORCED_MODULE))	/* warn only while this taint is not yet set */
+ 		pr_warn("%s: %s: kernel tainted.\n", mod->name, reason);
+ 	add_taint_module(mod, TAINT_FORCED_MODULE, LOCKDEP_NOW_UNRELIABLE);
+ 	return 0;	/* forced load accepted */
+ #else
+ 	return -ENOEXEC;	/* forcing is not compiled in */
+ #endif
+ }
+ 
+ #ifdef CONFIG_MODVERSIONS
+ 
+ static u32 resolve_rel_crc(const s32 *crc)
+ {
+ 	return *(u32 *)((void *)crc + *crc);	/* *crc is a byte offset from its own address to the u32 CRC */
+ }
+ 
+ static int check_version(const struct load_info *info,
+ 			 const char *symname,
+ 			 struct module *mod,
+ 			 const s32 *crc)
+ {
+ 	const Elf_Shdr *vers_sec = &info->sechdrs[info->index.vers];
+ 	struct modversion_info *table;
+ 	unsigned int idx, count;
+ 	u32 expected;
+ 
+ 	/* Exporter supplied no CRCs: accept, we are already tainted then. */
+ 	if (!crc)
+ 		return 1;
+ 
+ 	/* No version table at all (modprobe --force does this). */
+ 	if (info->index.vers == 0)
+ 		return try_to_force_load(mod, symname) == 0;
+ 
+ 	table = (void *)vers_sec->sh_addr;
+ 	count = vers_sec->sh_size / sizeof(struct modversion_info);
+ 
+ 	/* Find the first entry recorded for @symname. */
+ 	for (idx = 0; idx < count; idx++)
+ 		if (strcmp(table[idx].name, symname) == 0)
+ 			break;
+ 
+ 	if (idx == count) {
+ 		/* Broken toolchain: warn once, then let the symbol through. */
+ 		pr_warn_once("%s: no symbol version for %s\n",
+ 			     info->name, symname);
+ 		return 1;
+ 	}
+ 
+ 	if (IS_ENABLED(CONFIG_MODULE_REL_CRCS))
+ 		expected = resolve_rel_crc(crc);
+ 	else
+ 		expected = *crc;
+ 
+ 	if (table[idx].crc == expected)
+ 		return 1;
+ 
+ 	/* Mismatch: log both values for debugging, then return 0. */
+ 	pr_debug("Found checksum %X vs module %lX\n",
+ 		 expected, table[idx].crc);
+ 	pr_warn("%s: disagrees about version of symbol %s\n",
+ 	       info->name, symname);
+ 	return 0;
+ }
+ 
+ static inline int check_modstruct_version(const struct load_info *info,
+ 					  struct module *mod)
+ {
+ 	const s32 *crc;
+ 	bool found;
+ 
+ 	/*
+ 	 * "module_layout" should be found in the kernel itself, which cannot
+ 	 * be removed, so no lock is needed; preemption is disabled only to
+ 	 * placate lockdep.
+ 	 */
+ 	preempt_disable();
+ 	found = find_symbol("module_layout", NULL, &crc, true, false);
+ 	preempt_enable();
+ 	BUG_ON(!found);
+ 	return check_version(info, "module_layout", mod, crc);
+ }
+ 
+ /* Compare vermagic strings; with CRCs the leading kernel-version word is skipped. */
+ static inline int same_magic(const char *amagic, const char *bmagic,
+ 			     bool has_crcs)
+ {
+ 	/* With CRCs, compare only from the first space onwards. */
+ 	const char *lhs = has_crcs ? amagic + strcspn(amagic, " ") : amagic;
+ 	const char *rhs = has_crcs ? bmagic + strcspn(bmagic, " ") : bmagic;
+ 
+ 	return strcmp(lhs, rhs) == 0;
+ }
+ #else
+ static inline int check_version(const struct load_info *info,
+ 				const char *symname,
+ 				struct module *mod,
+ 				const s32 *crc)
+ {
+ 	return 1;	/* !CONFIG_MODVERSIONS: every symbol version is accepted */
+ }
+ 
+ static inline int check_modstruct_version(const struct load_info *info,
+ 					  struct module *mod)
+ {
+ 	return 1;	/* !CONFIG_MODVERSIONS: no module_layout CRC to compare */
+ }
+ 
+ static inline int same_magic(const char *amagic, const char *bmagic,
+ 			     bool has_crcs)
+ {
+ 	return strcmp(amagic, bmagic) == 0;	/* @has_crcs ignored: whole strings must match */
+ }
+ #endif /* CONFIG_MODVERSIONS */
+ 
+ static char *get_modinfo(const struct load_info *info, const char *tag);
+ static char *get_next_modinfo(const struct load_info *info, const char *tag,
+ 			      char *prev);
+ 
+ static int verify_namespace_is_imported(const struct load_info *info,
+ 					const struct kernel_symbol *sym,
+ 					struct module *mod)
+ {
+ 	const char *ns = kernel_symbol_namespace(sym);
+ 	char *import;
+ 
+ 	/* A symbol outside any namespace needs no import. */
+ 	if (!ns)
+ 		return 0;
+ 
+ 	/* Walk every "import_ns" modinfo entry looking for a match. */
+ 	for (import = get_modinfo(info, "import_ns"); import;
+ 	     import = get_next_modinfo(info, "import_ns", import))
+ 		if (strcmp(ns, import) == 0)
+ 			return 0;
+ 
+ #ifdef CONFIG_MODULE_ALLOW_MISSING_NAMESPACE_IMPORTS
+ 	pr_warn(
+ #else
+ 	pr_err(
+ #endif
+ 		"%s: module uses symbol (%s) from namespace %s, but does not import it.\n",
+ 		mod->name, kernel_symbol_name(sym), ns);
+ #ifndef CONFIG_MODULE_ALLOW_MISSING_NAMESPACE_IMPORTS
+ 	return -EINVAL;
+ #endif
+ 	return 0;
+ }
+ 
+ 
+ /*
+  * Resolve @name for @mod under module_mutex and record usage of the owner.
+  * Returns the symbol, NULL if not found, or an ERR_PTR() on a failed check.
+  */
+ static const struct kernel_symbol *resolve_symbol(struct module *mod,
+ 						  const struct load_info *info,
+ 						  const char *name,
+ 						  char ownername[])
+ {
+ 	struct module *owner;
+ 	const struct kernel_symbol *sym;
+ 	const s32 *crc;
+ 	int err;
+ 
+ 	/*
+ 	 * module_mutex is not heavily contended; an occasional sleep here only
+ 	 * costs one more wait_event_interruptible() iteration, which is harmless.
+ 	 */
+ 	sched_annotate_sleep();
+ 	mutex_lock(&module_mutex);
+ 	sym = find_symbol(name, &owner, &crc,
+ 			  !(mod->taints & (1 << TAINT_PROPRIETARY_MODULE)), true);
+ 	if (!sym)
+ 		goto unlock;
+ 
+ 	if (!check_version(info, name, mod, crc)) {
+ 		sym = ERR_PTR(-EINVAL);
+ 		goto getname;
+ 	}
+ 
+ 	err = verify_namespace_is_imported(info, sym, mod);
+ 	if (err) {
+ 		sym = ERR_PTR(err);
+ 		goto getname;
+ 	}
+ 
+ 	err = ref_module(mod, owner);
+ 	if (err)
+ 		sym = ERR_PTR(err);
+ 
+ getname:
+ 	/* Copy under the lock (we may hold no ref); strscpy() NUL-terminates. */
+ 	strscpy(ownername, module_name(owner), MODULE_NAME_LEN);
+ unlock:
+ 	mutex_unlock(&module_mutex);
+ 	return sym;
+ }
+ 
+ static const struct kernel_symbol *
+ resolve_symbol_wait(struct module *mod,
+ 		    const struct load_info *info,
+ 		    const char *name)
+ {
+ 	const struct kernel_symbol *ksym;
+ 	char owner[MODULE_NAME_LEN];	/* owner name written by resolve_symbol() when a symbol is found */
+ 
+ 	if (wait_event_interruptible_timeout(module_wq,
+ 			!IS_ERR(ksym = resolve_symbol(mod, info, name, owner))
+ 			|| PTR_ERR(ksym) != -EBUSY,	/* keep waiting only while the result is -EBUSY */
+ 					     30 * HZ) <= 0) {
+ 		pr_warn("%s: gave up waiting for init of module %s.\n",
+ 			mod->name, owner);
+ 	}
+ 	return ksym;	/* last resolve_symbol() result: symbol, NULL or ERR_PTR() */
+ }
+ 
+ /*
+  * /sys/module/foo/sections stuff
+  * J. Corbet <corbet@lwn.net>
+  */
+ #ifdef CONFIG_SYSFS
+ 
+ #ifdef CONFIG_KALLSYMS
+ static inline bool sect_empty(const Elf_Shdr *sect)
+ {
+ 	return !(sect->sh_flags & SHF_ALLOC) || sect->sh_size == 0;	/* not SHF_ALLOC, or zero bytes long */
+ }
+ 
+ struct module_sect_attr {
+ 	struct bin_attribute battr;	/* sysfs file; module_sect_read() recovers us via container_of() */
+ 	unsigned long address;		/* section sh_addr, as stored by add_sect_attrs() */
+ };
+ 
+ struct module_sect_attrs {
+ 	struct attribute_group grp;	/* the "sections" sysfs group */
+ 	unsigned int nsections;		/* entries of attrs[] whose name was allocated */
+ 	struct module_sect_attr attrs[];	/* flexible array, sized via struct_size() */
+ };
+ 
+ #define MODULE_SECT_READ_SIZE (3 /* "0x", "\n" */ + (BITS_PER_LONG / 4))
+ static ssize_t module_sect_read(struct file *file, struct kobject *kobj,
+ 				struct bin_attribute *battr,
+ 				char *buf, loff_t pos, size_t count)
+ {
+ 	struct module_sect_attr *sattr =
+ 		container_of(battr, struct module_sect_attr, battr);
+ 	char bounce[MODULE_SECT_READ_SIZE + 1];
+ 	void *shown = NULL;
+ 	size_t wrote;
+ 
+ 	/* Only a read starting at offset zero is accepted. */
+ 	if (pos != 0)
+ 		return -EINVAL;
+ 
+ 	/* The real address is printed only if kallsyms_show_value() agrees. */
+ 	if (kallsyms_show_value(file->f_cred))
+ 		shown = (void *)sattr->address;
+ 
+ 	/*
+ 	 * Format into a bounce buffer: the formatter always writes a trailing
+ 	 * NUL, which must not land in (or overflow) the binary read buffer.
+ 	 */
+ 	wrote = scnprintf(bounce, sizeof(bounce), "0x%px\n", shown);
+ 	count = min(count, wrote);
+ 	memcpy(buf, bounce, count);
+ 
+ 	return count;
+ }
+ 
+ static void free_sect_attrs(struct module_sect_attrs *sect_attrs)
+ {
+ 	unsigned int idx = sect_attrs->nsections;
+ 
+ 	while (idx-- > 0)	/* one kstrdup()ed name per counted section */
+ 		kfree(sect_attrs->attrs[idx].battr.attr.name);
+ 	kfree(sect_attrs);
+ }
+ 
+ static void add_sect_attrs(struct module *mod, const struct load_info *info)
+ {
+ 	unsigned int nloaded = 0, i, size[2];	/* size[0]: struct + attrs[], size[1]: pointer table */
+ 	struct module_sect_attrs *sect_attrs;
+ 	struct module_sect_attr *sattr;
+ 	struct bin_attribute **gattr;
+ 
+ 	/* Count non-empty sections, then allocate both areas in one zeroed block. */
+ 	for (i = 0; i < info->hdr->e_shnum; i++)
+ 		if (!sect_empty(&info->sechdrs[i]))
+ 			nloaded++;
+ 	size[0] = ALIGN(struct_size(sect_attrs, attrs, nloaded),
+ 			sizeof(sect_attrs->grp.bin_attrs[0]));
+ 	size[1] = (nloaded + 1) * sizeof(sect_attrs->grp.bin_attrs[0]);	/* +1 for the NULL terminator */
+ 	sect_attrs = kzalloc(size[0] + size[1], GFP_KERNEL);
+ 	if (sect_attrs == NULL)
+ 		return;	/* failure is silent: mod->sect_attrs is simply not set */
+ 
+ 	/* The group's pointer table lives directly behind the aligned attrs[] array. */
+ 	sect_attrs->grp.name = "sections";
+ 	sect_attrs->grp.bin_attrs = (void *)sect_attrs + size[0];
+ 
+ 	sect_attrs->nsections = 0;
+ 	sattr = &sect_attrs->attrs[0];
+ 	gattr = &sect_attrs->grp.bin_attrs[0];
+ 	for (i = 0; i < info->hdr->e_shnum; i++) {
+ 		Elf_Shdr *sec = &info->sechdrs[i];
+ 		if (sect_empty(sec))
+ 			continue;
+ 		sysfs_bin_attr_init(&sattr->battr);
+ 		sattr->address = sec->sh_addr;
+ 		sattr->battr.attr.name =
+ 			kstrdup(info->secstrings + sec->sh_name, GFP_KERNEL);
+ 		if (sattr->battr.attr.name == NULL)
+ 			goto out;
+ 		sect_attrs->nsections++;	/* counted only once the name exists, for free_sect_attrs() */
+ 		sattr->battr.read = module_sect_read;
+ 		sattr->battr.size = MODULE_SECT_READ_SIZE;
+ 		sattr->battr.attr.mode = 0400;
+ 		*(gattr++) = &(sattr++)->battr;
+ 	}
+ 	*gattr = NULL;
+ 
+ 	if (sysfs_create_group(&mod->mkobj.kobj, &sect_attrs->grp))
+ 		goto out;
+ 
+ 	mod->sect_attrs = sect_attrs;
+ 	return;
+   out:
+ 	free_sect_attrs(sect_attrs);	/* frees the names recorded so far, then the block itself */
+ }
+ 
+ static void remove_sect_attrs(struct module *mod)
+ {
+ 	struct module_sect_attrs *attrs = mod->sect_attrs;
+ 
+ 	if (!attrs)
+ 		return;
+ 	sysfs_remove_group(&mod->mkobj.kobj, &attrs->grp);
+ 	/* Nothing can still be using the attributes: free them right away. */
+ 	free_sect_attrs(attrs);
+ 	mod->sect_attrs = NULL;
+ }
+ 
+ /*
+  * /sys/module/foo/notes/.section.name gives contents of SHT_NOTE sections.
+  */
+ 
+ struct module_notes_attrs {
+ 	struct kobject *dir;		/* the "notes" directory kobject, NULL until created */
+ 	unsigned int notes;		/* number of entries in attrs[] */
+ 	struct bin_attribute attrs[];	/* flexible array, sized via struct_size() */
+ };
+ 
+ static ssize_t module_notes_read(struct file *filp, struct kobject *kobj,
+ 				 struct bin_attribute *bin_attr,
+ 				 char *buf, loff_t pos, size_t count)
+ {
+ 	const void *src = bin_attr->private + pos;
+ 
+ 	/* No bounds check here: the caller validated pos and count against our size. */
+ 	memcpy(buf, src, count);
+ 	return count;
+ }
+ 
+ static void free_notes_attrs(struct module_notes_attrs *notes_attrs,
+ 			     unsigned int i)
+ {
+ 	/* With a directory present, remove the first @i files, highest index first. */
+ 	if (notes_attrs->dir) {
+ 		for (; i > 0; i--)
+ 			sysfs_remove_bin_file(notes_attrs->dir, &notes_attrs->attrs[i - 1]);
+ 		kobject_put(notes_attrs->dir);
+ 	}
+ 	kfree(notes_attrs);
+ }
+ 
+ static void add_notes_attrs(struct module *mod, const struct load_info *info)
+ {
+ 	unsigned int notes, loaded, i;	/* loaded: index into mod->sect_attrs->attrs[] */
+ 	struct module_notes_attrs *notes_attrs;
+ 	struct bin_attribute *nattr;
+ 
+ 	/* Section attributes supply the file names below; without them, give up. */
+ 	if (!mod->sect_attrs)
+ 		return;
+ 
+ 	/* Count the non-empty SHT_NOTE sections to size the allocation. */
+ 	notes = 0;
+ 	for (i = 0; i < info->hdr->e_shnum; i++)
+ 		if (!sect_empty(&info->sechdrs[i]) &&
+ 		    (info->sechdrs[i].sh_type == SHT_NOTE))
+ 			++notes;
+ 
+ 	if (notes == 0)
+ 		return;
+ 
+ 	notes_attrs = kzalloc(struct_size(notes_attrs, attrs, notes),
+ 			      GFP_KERNEL);
+ 	if (notes_attrs == NULL)
+ 		return;
+ 
+ 	notes_attrs->notes = notes;
+ 	nattr = &notes_attrs->attrs[0];
+ 	for (loaded = i = 0; i < info->hdr->e_shnum; ++i) {
+ 		if (sect_empty(&info->sechdrs[i]))
+ 			continue;
+ 		if (info->sechdrs[i].sh_type == SHT_NOTE) {
+ 			sysfs_bin_attr_init(nattr);
+ 			nattr->attr.name = mod->sect_attrs->attrs[loaded].battr.attr.name;	/* name string is shared, not copied */
+ 			nattr->attr.mode = S_IRUGO;
+ 			nattr->size = info->sechdrs[i].sh_size;
+ 			nattr->private = (void *) info->sechdrs[i].sh_addr;	/* read back by module_notes_read() */
+ 			nattr->read = module_notes_read;
+ 			++nattr;
+ 		}
+ 		++loaded;	/* advances for every non-empty section, note or not */
+ 	}
+ 
+ 	notes_attrs->dir = kobject_create_and_add("notes", &mod->mkobj.kobj);
+ 	if (!notes_attrs->dir)
+ 		goto out;
+ 
+ 	for (i = 0; i < notes; ++i)
+ 		if (sysfs_create_bin_file(notes_attrs->dir,
+ 					  &notes_attrs->attrs[i]))
+ 			goto out;	/* i == number of files created so far */
+ 
+ 	mod->notes_attrs = notes_attrs;
+ 	return;
+ 
+   out:
+ 	free_notes_attrs(notes_attrs, i);
+ }
+ 
+ static void remove_notes_attrs(struct module *mod)
+ {
+ 	if (mod->notes_attrs)	/* NOTE(review): mod->notes_attrs is not reset to NULL after the free */
+ 		free_notes_attrs(mod->notes_attrs, mod->notes_attrs->notes);
+ }
+ 
+ #else
+ 
+ static inline void add_sect_attrs(struct module *mod,
+ 				  const struct load_info *info)
+ {
+ }	/* no-op when CONFIG_KALLSYMS is off */
+ 
+ static inline void remove_sect_attrs(struct module *mod)
+ {
+ }	/* no-op when CONFIG_KALLSYMS is off */
+ 
+ static inline void add_notes_attrs(struct module *mod,
+ 				   const struct load_info *info)
+ {
+ }	/* no-op when CONFIG_KALLSYMS is off */
+ 
+ static inline void remove_notes_attrs(struct module *mod)
+ {
+ }	/* no-op when CONFIG_KALLSYMS is off */
+ #endif /* CONFIG_KALLSYMS */
+ 
+ static void del_usage_links(struct module *mod)
+ {
+ #ifdef CONFIG_MODULE_UNLOAD
+ 	struct module_use *use;
+ 
+ 	mutex_lock(&module_mutex);	/* target_list is walked under module_mutex */
+ 	list_for_each_entry(use, &mod->target_list, target_list)
+ 		sysfs_remove_link(use->target->holders_dir, mod->name);	/* link named after @mod in each target's holders dir */
+ 	mutex_unlock(&module_mutex);
+ #endif
+ }
+ 
+ static int add_usage_links(struct module *mod)
+ {
+ 	int ret = 0;	/* stays 0 when CONFIG_MODULE_UNLOAD is off */
+ #ifdef CONFIG_MODULE_UNLOAD
+ 	struct module_use *use;
+ 
+ 	mutex_lock(&module_mutex);
+ 	list_for_each_entry(use, &mod->target_list, target_list) {
+ 		ret = sysfs_create_link(use->target->holders_dir,
+ 					&mod->mkobj.kobj, mod->name);
+ 		if (ret)
+ 			break;	/* stop at the first failure */
+ 	}
+ 	mutex_unlock(&module_mutex);
+ 	if (ret)
+ 		del_usage_links(mod);	/* walks the whole list again, including targets never linked */
+ #endif
+ 	return ret;
+ }
+ 
+ static void module_remove_modinfo_attrs(struct module *mod, int end);
+ 
+ static int module_add_modinfo_attrs(struct module *mod)
+ {
+ 	struct module_attribute *attr;
+ 	struct module_attribute *temp_attr;
+ 	int error = 0;
+ 	int i;
+ 
+ 	mod->modinfo_attrs = kzalloc((sizeof(struct module_attribute) *
+ 					(ARRAY_SIZE(modinfo_attrs) + 1)),
+ 					GFP_KERNEL);
+ 	if (!mod->modinfo_attrs)
+ 		return -ENOMEM;
+ 
+ 	temp_attr = mod->modinfo_attrs;
+ 	for (i = 0; (attr = modinfo_attrs[i]); i++) {
+ 		/* Skip attributes whose ->test() hook rejects this module. */
+ 		if (attr->test && !attr->test(mod))
+ 			continue;
+ 		*temp_attr = *attr;
+ 		sysfs_attr_init(&temp_attr->attr);
+ 		error = sysfs_create_file(&mod->mkobj.kobj, &temp_attr->attr);
+ 		if (error)
+ 			goto error_out;
+ 		++temp_attr;
+ 	}
+ 	return 0;
+ 
+ error_out:
+ 	/* Undo only files really created: skipped attrs desync i from the copy. */
+ 	if (temp_attr > mod->modinfo_attrs)
+ 		module_remove_modinfo_attrs(mod, temp_attr - mod->modinfo_attrs - 1);
+ 	else
+ 		kfree(mod->modinfo_attrs);
+ 	return error;
+ }
+ 
+ static void module_remove_modinfo_attrs(struct module *mod, int end)
+ {
+ 	struct module_attribute *cur;
+ 	int idx;
+ 
+ 	for (idx = 0; (cur = &mod->modinfo_attrs[idx]); idx++) {
+ 		bool past_end = end >= 0 && idx > end;	/* negative @end: no limit */
+ 
+ 		/* A NULL name marks the zero-filled terminator entry. */
+ 		if (past_end || !cur->attr.name)
+ 			break;
+ 		sysfs_remove_file(&mod->mkobj.kobj, &cur->attr);
+ 		if (cur->free)
+ 			cur->free(mod);
+ 	}
+ 	kfree(mod->modinfo_attrs);
+ }
+ 
+ static void mod_kobject_put(struct module *mod)
+ {
+ 	DECLARE_COMPLETION_ONSTACK(c);
+ 	mod->mkobj.kobj_completion = &c;	/* published before the put so it can be signalled */
+ 	kobject_put(&mod->mkobj.kobj);
+ 	wait_for_completion(&c);	/* presumably completed by the kobject release path; confirm */
+ }
+ 
+ static int mod_sysfs_init(struct module *mod)
+ {
+ 	struct kobject *existing;
+ 	int ret;
+ 
+ 	/* Refuse early while the module sysfs infrastructure is not set up. */
+ 	if (!module_sysfs_initialized) {
+ 		pr_err("%s: module sysfs not initialized\n", mod->name);
+ 		return -EINVAL;
+ 	}
+ 
+ 	/* A kobject of the same name in module_kset means a duplicate load. */
+ 	existing = kset_find_obj(module_kset, mod->name);
+ 	if (existing) {
+ 		pr_err("%s: module is already loaded\n", mod->name);
+ 		kobject_put(existing);
+ 		return -EINVAL;
+ 	}
+ 
+ 	mod->mkobj.mod = mod;
+ 
+ 	/* Start from a zeroed kobject before adding it to module_kset. */
+ 	memset(&mod->mkobj.kobj, 0, sizeof(mod->mkobj.kobj));
+ 	mod->mkobj.kobj.kset = module_kset;
+ 	ret = kobject_init_and_add(&mod->mkobj.kobj, &module_ktype, NULL,
+ 				   "%s", mod->name);
+ 	if (ret)
+ 		mod_kobject_put(mod);
+ 
+ 	return ret;
+ }
+ 
+ static int mod_sysfs_setup(struct module *mod,
+ 			   const struct load_info *info,
+ 			   struct kernel_param *kparam,
+ 			   unsigned int num_params)
+ {
+ 	int err;
+ 
+ 	err = mod_sysfs_init(mod);	/* on failure nothing needs unwinding here */
+ 	if (err)
+ 		goto out;
+ 
+ 	mod->holders_dir = kobject_create_and_add("holders", &mod->mkobj.kobj);
+ 	if (!mod->holders_dir) {
+ 		err = -ENOMEM;
+ 		goto out_unreg;
+ 	}
+ 
+ 	err = module_param_sysfs_setup(mod, kparam, num_params);
+ 	if (err)
+ 		goto out_unreg_holders;
+ 
+ 	err = module_add_modinfo_attrs(mod);
+ 	if (err)
+ 		goto out_unreg_param;
+ 
+ 	err = add_usage_links(mod);
+ 	if (err)
+ 		goto out_unreg_modinfo_attrs;
+ 
+ 	add_sect_attrs(mod, info);	/* void: a failure here does not fail the setup */
+ 	add_notes_attrs(mod, info);	/* void: a failure here does not fail the setup */
+ 
+ 	return 0;
+ 
+ out_unreg_modinfo_attrs:	/* labels unwind in reverse order of the setup steps */
+ 	module_remove_modinfo_attrs(mod, -1);
+ out_unreg_param:
+ 	module_param_sysfs_remove(mod);
+ out_unreg_holders:
+ 	kobject_put(mod->holders_dir);
+ out_unreg:
+ 	mod_kobject_put(mod);
+ out:
+ 	return err;
+ }
+ 
+ static void mod_sysfs_fini(struct module *mod)
+ {
+ 	remove_notes_attrs(mod);	/* notes first: their names point into the section attrs */
+ 	remove_sect_attrs(mod);
+ 	mod_kobject_put(mod);	/* finally drop the module kobject and wait for it */
+ }
+ 
+ static void init_param_lock(struct module *mod)
+ {
+ 	mutex_init(&mod->param_lock);	/* only this CONFIG_SYSFS variant initialises the lock */
+ }
+ #else /* !CONFIG_SYSFS */
+ 
+ static int mod_sysfs_setup(struct module *mod,
+ 			   const struct load_info *info,
+ 			   struct kernel_param *kparam,
+ 			   unsigned int num_params)
+ {
+ 	return 0;	/* !CONFIG_SYSFS: nothing to create */
+ }
+ 
+ static void mod_sysfs_fini(struct module *mod)
+ {
+ }	/* !CONFIG_SYSFS: no-op */
+ 
+ static void module_remove_modinfo_attrs(struct module *mod, int end)
+ {
+ }	/* !CONFIG_SYSFS: no-op */
+ 
+ static void del_usage_links(struct module *mod)
+ {
+ }	/* !CONFIG_SYSFS: no-op */
+ 
+ static void init_param_lock(struct module *mod)
+ {
+ }	/* !CONFIG_SYSFS: no-op */
+ #endif /* CONFIG_SYSFS */
+ 
+ static void mod_sysfs_teardown(struct module *mod)
+ {
+ 	del_usage_links(mod);	/* these calls mirror mod_sysfs_setup() in reverse */
+ 	module_remove_modinfo_attrs(mod, -1);	/* -1: no upper limit, remove every attribute */
+ 	module_param_sysfs_remove(mod);
+ 	kobject_put(mod->mkobj.drivers_dir);
+ 	kobject_put(mod->holders_dir);
+ 	mod_sysfs_fini(mod);
+ }
+ 
+ #ifdef CONFIG_ARCH_HAS_STRICT_MODULE_RWX
+ /*
+  * LKM RO/NX protection: protect module's text/ro-data
+  * from modification and any data from execution.
+  *
+  * General layout of module is:
+  *          [text] [read-only-data] [ro-after-init] [writable data]
+  * text_size -----^                ^               ^               ^
+  * ro_size ------------------------|               |               |
+  * ro_after_init_size -----------------------------|               |
+  * size -----------------------------------------------------------|
+  *
+  * These values are always page-aligned (as is base)
+  */
+ static void frob_text(const struct module_layout *layout,
+ 		      int (*set_memory)(unsigned long start, int num_pages))
+ {
+ 	BUG_ON((unsigned long)layout->base & (PAGE_SIZE-1));	/* base must be page aligned */
+ 	BUG_ON((unsigned long)layout->text_size & (PAGE_SIZE-1));	/* so must the text size */
+ 	set_memory((unsigned long)layout->base,	/* covers [base, base + text_size) */
+ 		   layout->text_size >> PAGE_SHIFT);
+ }
+ 
+ #ifdef CONFIG_STRICT_MODULE_RWX
+ static void frob_rodata(const struct module_layout *layout,
+ 			int (*set_memory)(unsigned long start, int num_pages))
+ {
+ 	BUG_ON((unsigned long)layout->base & (PAGE_SIZE-1));	/* every boundary must be page aligned */
+ 	BUG_ON((unsigned long)layout->text_size & (PAGE_SIZE-1));
+ 	BUG_ON((unsigned long)layout->ro_size & (PAGE_SIZE-1));
+ 	set_memory((unsigned long)layout->base + layout->text_size,	/* covers [text_size, ro_size) */
+ 		   (layout->ro_size - layout->text_size) >> PAGE_SHIFT);
+ }
+ 
+ static void frob_ro_after_init(const struct module_layout *layout,
+ 				int (*set_memory)(unsigned long start, int num_pages))
+ {
+ 	BUG_ON((unsigned long)layout->base & (PAGE_SIZE-1));	/* every boundary must be page aligned */
+ 	BUG_ON((unsigned long)layout->ro_size & (PAGE_SIZE-1));
+ 	BUG_ON((unsigned long)layout->ro_after_init_size & (PAGE_SIZE-1));
+ 	set_memory((unsigned long)layout->base + layout->ro_size,	/* covers [ro_size, ro_after_init_size) */
+ 		   (layout->ro_after_init_size - layout->ro_size) >> PAGE_SHIFT);
+ }
+ 
+ static void frob_writable_data(const struct module_layout *layout,
+ 			       int (*set_memory)(unsigned long start, int num_pages))
+ {
+ 	BUG_ON((unsigned long)layout->base & (PAGE_SIZE-1));	/* every boundary must be page aligned */
+ 	BUG_ON((unsigned long)layout->ro_after_init_size & (PAGE_SIZE-1));
+ 	BUG_ON((unsigned long)layout->size & (PAGE_SIZE-1));
+ 	set_memory((unsigned long)layout->base + layout->ro_after_init_size,	/* covers [ro_after_init_size, size) */
+ 		   (layout->size - layout->ro_after_init_size) >> PAGE_SHIFT);
+ }
+ 
+ /* Livepatching wants to disable read-only so it can modify the module. */
+ void module_disable_ro(const struct module *mod)
+ {
+ 	if (!rodata_enabled)	/* nothing to undo when rodata protection is off */
+ 		return;
+ 
+ 	frob_text(&mod->core_layout, set_memory_rw);
+ 	frob_rodata(&mod->core_layout, set_memory_rw);
+ 	frob_ro_after_init(&mod->core_layout, set_memory_rw);
+ 	frob_text(&mod->init_layout, set_memory_rw);
+ 	frob_rodata(&mod->init_layout, set_memory_rw);
+ }
+ 
+ void module_enable_ro(const struct module *mod, bool after_init)
+ {
+ 	if (!rodata_enabled)	/* protection globally disabled: leave everything writable */
+ 		return;
+ 
+ 	set_vm_flush_reset_perms(mod->core_layout.base);
+ 	set_vm_flush_reset_perms(mod->init_layout.base);
+ 	frob_text(&mod->core_layout, set_memory_ro);
+ 
+ 	frob_rodata(&mod->core_layout, set_memory_ro);
+ 	frob_text(&mod->init_layout, set_memory_ro);
+ 	frob_rodata(&mod->init_layout, set_memory_ro);
+ 
+ 	if (after_init)	/* the ro-after-init range is sealed only when the caller asks */
+ 		frob_ro_after_init(&mod->core_layout, set_memory_ro);
+ }
+ 
+ static void module_enable_nx(const struct module *mod)
+ {
+ 	frob_rodata(&mod->core_layout, set_memory_nx);	/* every non-text range of the core layout */
+ 	frob_ro_after_init(&mod->core_layout, set_memory_nx);
+ 	frob_writable_data(&mod->core_layout, set_memory_nx);
+ 	frob_rodata(&mod->init_layout, set_memory_nx);	/* init layout: the ro-after-init range is not touched */
+ 	frob_writable_data(&mod->init_layout, set_memory_nx);
+ }
+ 
+ /* Make the text of every formed module writable again; no-op if !rodata_enabled. */
+ void set_all_modules_text_rw(void)
+ {
+ 	struct module *mod;
+ 
+ 	if (!rodata_enabled)
+ 		return;
+ 
+ 	mutex_lock(&module_mutex);	/* the module list is walked under module_mutex */
+ 	list_for_each_entry_rcu(mod, &modules, list) {
+ 		if (mod->state == MODULE_STATE_UNFORMED)
+ 			continue;	/* skip modules still marked unformed */
+ 
+ 		frob_text(&mod->core_layout, set_memory_rw);
+ 		frob_text(&mod->init_layout, set_memory_rw);
+ 	}
+ 	mutex_unlock(&module_mutex);
+ }
+ 
+ /* Make the text of every live or coming module read-only; no-op if !rodata_enabled. */
+ void set_all_modules_text_ro(void)
+ {
+ 	struct module *mod;
+ 
+ 	if (!rodata_enabled)
+ 		return;
+ 
+ 	mutex_lock(&module_mutex);
+ 	list_for_each_entry_rcu(mod, &modules, list) {
+ 		/*
+ 		 * Going modules are skipped as well: their ro protection may
+ 		 * already have been disabled, and re-enabling it would cause
+ 		 * protection faults when the module is deallocated.
+ 		 */
+ 		if (mod->state == MODULE_STATE_UNFORMED ||
+ 			mod->state == MODULE_STATE_GOING)
+ 			continue;
+ 
+ 		frob_text(&mod->core_layout, set_memory_ro);
+ 		frob_text(&mod->init_layout, set_memory_ro);
+ 	}
+ 	mutex_unlock(&module_mutex);
+ }
+ #else /* !CONFIG_STRICT_MODULE_RWX */
+ static void module_enable_nx(const struct module *mod) { }	/* no-op without CONFIG_STRICT_MODULE_RWX */
+ #endif /*  CONFIG_STRICT_MODULE_RWX */
+ static void module_enable_x(const struct module *mod)
+ {
+ 	frob_text(&mod->core_layout, set_memory_x);	/* only the text range of each layout becomes executable */
+ 	frob_text(&mod->init_layout, set_memory_x);
+ }
+ #else /* !CONFIG_ARCH_HAS_STRICT_MODULE_RWX */
+ static void module_enable_nx(const struct module *mod) { }	/* no-op without CONFIG_ARCH_HAS_STRICT_MODULE_RWX */
+ static void module_enable_x(const struct module *mod) { }	/* no-op without CONFIG_ARCH_HAS_STRICT_MODULE_RWX */
+ #endif /* CONFIG_ARCH_HAS_STRICT_MODULE_RWX */
+ 
+ 
+ #ifdef CONFIG_LIVEPATCH
+ /*
+  * Persist Elf information about a module. Copy the Elf header,
+  * section header table, section string table, and symtab section
+  * index from info to mod->klp_info.
+  */
+ static int copy_module_elf(struct module *mod, struct load_info *info)
+ {
+ 	unsigned int size, symndx;	/* size is reused for each copy below */
+ 	int ret;
+ 
+ 	size = sizeof(*mod->klp_info);
+ 	mod->klp_info = kmalloc(size, GFP_KERNEL);
+ 	if (mod->klp_info == NULL)
+ 		return -ENOMEM;
+ 
+ 	/* Elf header: copied by value into klp_info */
+ 	size = sizeof(mod->klp_info->hdr);
+ 	memcpy(&mod->klp_info->hdr, info->hdr, size);
+ 
+ 	/* Elf section header table: e_shnum entries, duplicated */
+ 	size = sizeof(*info->sechdrs) * info->hdr->e_shnum;
+ 	mod->klp_info->sechdrs = kmemdup(info->sechdrs, size, GFP_KERNEL);
+ 	if (mod->klp_info->sechdrs == NULL) {
+ 		ret = -ENOMEM;
+ 		goto free_info;
+ 	}
+ 
+ 	/* Elf section name string table: sized from the e_shstrndx section */
+ 	size = info->sechdrs[info->hdr->e_shstrndx].sh_size;
+ 	mod->klp_info->secstrings = kmemdup(info->secstrings, size, GFP_KERNEL);
+ 	if (mod->klp_info->secstrings == NULL) {
+ 		ret = -ENOMEM;
+ 		goto free_sechdrs;
+ 	}
+ 
+ 	/* Elf symbol section index */
+ 	symndx = info->index.sym;
+ 	mod->klp_info->symndx = symndx;
+ 
+ 	/*
+ 	 * For livepatch modules, core_kallsyms.symtab is a complete
+ 	 * copy of the original symbol table. Point sh_addr of the copied
+ 	 * header at core_kallsyms.symtab, because the symtab copy in module
+ 	 * init memory is freed at the end of do_init_module().
+ 	 */
+ 	mod->klp_info->sechdrs[symndx].sh_addr = \
+ 		(unsigned long) mod->core_kallsyms.symtab;
+ 
+ 	return 0;
+ 
+ free_sechdrs:
+ 	kfree(mod->klp_info->sechdrs);
+ free_info:
+ 	kfree(mod->klp_info);	/* NOTE(review): mod->klp_info is left pointing at freed memory */
+ 	return ret;
+ }
+ 
+ static void free_module_elf(struct module *mod)
+ {
+ 	kfree(mod->klp_info->sechdrs);	/* dereferences klp_info: it must be non-NULL here */
+ 	kfree(mod->klp_info->secstrings);
+ 	kfree(mod->klp_info);	/* container last, after its members */
+ }
+ #else /* !CONFIG_LIVEPATCH */
+ static int copy_module_elf(struct module *mod, struct load_info *info)
+ {
+ 	return 0;	/* !CONFIG_LIVEPATCH: nothing to copy */
+ }
+ 
+ static void free_module_elf(struct module *mod)
+ {
+ }	/* !CONFIG_LIVEPATCH: no-op */
+ #endif /* CONFIG_LIVEPATCH */
+ 
+ void __weak module_memfree(void *module_region)
+ {
+ 	/*
+ 	 * The region may be read-only, and vmalloc does not support freeing
+ 	 * read-only memory from interrupt context, hence the warning below.
+ 	 */
+ 	WARN_ON(in_interrupt());
+ 	vfree(module_region);	/* __weak default: an architecture may supply its own version */
+ }
+ 
+ void __weak module_arch_cleanup(struct module *mod)
+ {
+ }	/* __weak default: does nothing unless overridden */
+ 
+ void __weak module_arch_freeing_init(struct module *mod)
+ {
+ }	/* __weak default: does nothing unless overridden */
+ 
+ /* Free a module: tear down sysfs, unlink it from the lists, release its memory. */
+ static void free_module(struct module *mod)
+ {
+ 	trace_module_free(mod);
+ 
+ 	mod_sysfs_teardown(mod);
+ 
+ 	/* We leave it in the list to prevent duplicate loads, but make sure
+ 	 * that no one uses it while it's being deconstructed. */
+ 	mutex_lock(&module_mutex);
+ 	mod->state = MODULE_STATE_UNFORMED;
+ 	mutex_unlock(&module_mutex);
+ 
+ 	/* Remove dynamic debug info */
+ 	ddebug_remove_module(mod->name);
+ 
+ 	/* Arch-specific cleanup. */
+ 	module_arch_cleanup(mod);
+ 
+ 	/* Module unload stuff */
+ 	module_unload_free(mod);
+ 
+ 	/* Free any allocated parameters. */
+ 	destroy_params(mod->kp, mod->num_kp);
+ 
+ 	if (is_livepatch_module(mod))
+ 		free_module_elf(mod);
+ 
+ 	/* Now we can delete it from the lists */
+ 	mutex_lock(&module_mutex);
+ 	/* Unlink carefully: kallsyms could be walking list. */
+ 	list_del_rcu(&mod->list);
+ 	mod_tree_remove(mod);
+ 	/* Remove this module from bug list, this uses list_del_rcu */
+ 	module_bug_cleanup(mod);
+ 	/* Wait for an RCU grace period before releasing mod->list and buglist. */
+ 	synchronize_rcu();
+ 	mutex_unlock(&module_mutex);
+ 
+ 	/* This may be empty, but that's OK */
+ 	module_arch_freeing_init(mod);
+ 	module_memfree(mod->init_layout.base);
+ 	kfree(mod->args);
+ 	percpu_modfree(mod);
+ 
+ 	/* Free lock-classes; relies on the preceding sync_rcu(). */
+ 	lockdep_free_key_range(mod->core_layout.base, mod->core_layout.size);
+ 
+ 	/* Finally, free the core (containing the module structure): @mod is invalid afterwards */
+ 	module_memfree(mod->core_layout.base);
+ }
+ 
+ void *__symbol_get(const char *symbol)
+ {
+ 	struct module *owner;
+ 	const struct kernel_symbol *sym;
+ 
+ 	preempt_disable();	/* lookup and reference grab happen with preemption off */
+ 	sym = find_symbol(symbol, &owner, NULL, true, true);
+ 	if (sym && strong_try_module_get(owner))	/* non-zero: no reference obtained on the owner */
+ 		sym = NULL;
+ 	preempt_enable();
+ 
+ 	return sym ? (void *)kernel_symbol_value(sym) : NULL;	/* NULL: unknown symbol or no reference */
+ }
+ EXPORT_SYMBOL_GPL(__symbol_get);
+ 
+ /*
+  * Ensure that an exported symbol [global namespace] does not already exist
+  * in the kernel or in some other module's exported symbol table.
+  *
+  * You must hold the module_mutex.
+  */
+ static int verify_exported_symbols(struct module *mod)
+ {
+ 	unsigned int t;
+ 	struct module *owner;
+ 	const struct kernel_symbol *s, *end;
+ 	struct {
+ 		const struct kernel_symbol *sym;
+ 		unsigned int num;
+ 	} arr[] = {
+ 		{ mod->syms, mod->num_syms },
+ 		{ mod->gpl_syms, mod->num_gpl_syms },
+ 		{ mod->gpl_future_syms, mod->num_gpl_future_syms },
+ #ifdef CONFIG_UNUSED_SYMBOLS
+ 		{ mod->unused_syms, mod->num_unused_syms },
+ 		{ mod->unused_gpl_syms, mod->num_unused_gpl_syms },
+ #endif
+ 	};
+ 
+ 	for (t = 0; t < ARRAY_SIZE(arr); t++) {
+ 		end = arr[t].sym + arr[t].num;
+ 		for (s = arr[t].sym; s < end; s++) {
+ 			if (!find_symbol(kernel_symbol_name(s), &owner, NULL,
+ 					 true, false))
+ 				continue;
+ 			pr_err("%s: exports duplicate symbol %s"
+ 			       " (owned by %s)\n", mod->name,
+ 			       kernel_symbol_name(s), module_name(owner));
+ 			return -ENOEXEC;
+ 		}
+ 	}
+ 	return 0;
+ }
+ 
+ static bool ignore_undef_symbol(Elf_Half emachine, const char *name)
+ {
+ 	bool is_x86 = emachine == EM_386 || emachine == EM_X86_64;
+ 
+ 	/*
+ 	 * x86 PIC code, and Clang non-PIC code, may contain "call foo@PLT",
+ 	 * and GNU as older than 2.37 then leaves an unreferenced
+ 	 * _GLOBAL_OFFSET_TABLE_ behind on x86-64 (i386 has a similar problem
+ 	 * but may not deserve a fix).
+ 	 * Should the list of ignored symbols ever grow, consider warning
+ 	 * only about symbols that a relocation actually references.
+ 	 */
+ 	return is_x86 && !strcmp(name, "_GLOBAL_OFFSET_TABLE_");
+ }
+ 
+ /* Change all symbols so that st_value encodes the pointer directly. */
+ static int simplify_symbols(struct module *mod, const struct load_info *info)
+ {
+ 	Elf_Shdr *symsec = &info->sechdrs[info->index.sym];
+ 	Elf_Sym *sym = (void *)symsec->sh_addr;
+ 	unsigned long secbase;
+ 	unsigned int i;
+ 	int ret = 0;	/* last error seen; the loop keeps going after a failure */
+ 	const struct kernel_symbol *ksym;
+ 
+ 	for (i = 1; i < symsec->sh_size / sizeof(Elf_Sym); i++) {
+ 		const char *name = info->strtab + sym[i].st_name;
+ 
+ 		switch (sym[i].st_shndx) {
+ 		case SHN_COMMON:
+ 			/* Ignore common symbols */
+ 			if (!strncmp(name, "__gnu_lto", 9))
+ 				break;
+ 			/* We compiled with -fno-common, so these should not happen. */
+ 			pr_debug("Common symbol: %s\n", name);
+ 			pr_warn("%s: please compile with -fno-common\n", mod->name);
+ 			ret = -ENOEXEC;
+ 			break;
+ 
+ 		case SHN_ABS:
+ 			/* Don't need to do anything */
+ 			pr_debug("Absolute symbol: 0x%08lx\n", (unsigned long)sym[i].st_value);
+ 			break;
+ 
+ 		case SHN_LIVEPATCH:
+ 			/* Livepatch symbols are resolved by livepatch */
+ 			break;
+ 
+ 		case SHN_UNDEF:
+ 			ksym = resolve_symbol_wait(mod, info, name);
+ 			/* Ok if resolved.  */
+ 			if (ksym && !IS_ERR(ksym)) {
+ 				sym[i].st_value = kernel_symbol_value(ksym);
+ 				break;
+ 			}
+ 			/* Ok if weak or ignored.  */
+ 			if (!ksym &&
+ 			    (ELF_ST_BIND(sym[i].st_info) == STB_WEAK ||
+ 			     ignore_undef_symbol(info->hdr->e_machine, name)))
+ 				break;
+ 			/* Unresolved: keep the lookup's error, or -ENOENT if it gave none. */
+ 			ret = PTR_ERR(ksym) ?: -ENOENT;
+ 			pr_warn("%s: Unknown symbol %s (err %d)\n", mod->name, name, ret);
+ 			break;
+ 
+ 		default:
+ 			/* Never index sechdrs[] beyond the section header table. */
+ 			if (sym[i].st_shndx >= info->hdr->e_shnum) {
+ 				pr_warn("%s: symbol %s: bad section index %u\n",
+ 					mod->name, name, sym[i].st_shndx);
+ 				ret = -ENOEXEC;
+ 				break;
+ 			}
+ 			/* Divert to percpu allocation if a percpu var. */
+ 			if (sym[i].st_shndx == info->index.pcpu)
+ 				secbase = (unsigned long)mod_percpu(mod);
+ 			else
+ 				secbase = info->sechdrs[sym[i].st_shndx].sh_addr;
+ 			sym[i].st_value += secbase;
+ 			break;
+ 		}
+ 	}
+ 	return ret;
+ }
+ 
+ static int apply_relocations(struct module *mod, const struct load_info *info)
+ {
+ 	unsigned int i;
+ 	int err = 0;
+ 
+ 	/* Walk every section header after the first and apply REL/RELA sections. */
+ 	for (i = 1; i < info->hdr->e_shnum; i++) {
+ 		unsigned int infosec = info->sechdrs[i].sh_info;	/* index of the section being relocated */
+ 
+ 		/* sh_info out of range: not a valid relocation section, skip it. */
+ 		if (infosec >= info->hdr->e_shnum)
+ 			continue;
+ 
+ 		/* Don't bother when the target section is not allocated */
+ 		if (!(info->sechdrs[infosec].sh_flags & SHF_ALLOC))
+ 			continue;
+ 
+ 		/* Livepatch relocation sections are applied by livepatch */
+ 		if (info->sechdrs[i].sh_flags & SHF_RELA_LIVEPATCH)
+ 			continue;
+ 
+ 		if (info->sechdrs[i].sh_type == SHT_REL)
+ 			err = apply_relocate(info->sechdrs, info->strtab,
+ 					     info->index.sym, i, mod);
+ 		else if (info->sechdrs[i].sh_type == SHT_RELA)
+ 			err = apply_relocate_add(info->sechdrs, info->strtab,
+ 						 info->index.sym, i, mod);
+ 		if (err < 0)	/* stop at the first failing section */
+ 			break;
+ 	}
+ 	return err;
+ }
+ 
+ /* Extra bytes an arch wants reserved in front of an individual section */
+ unsigned int __weak arch_mod_section_prepend(struct module *mod,
+ 					     unsigned int section)
+ {
+ 	/* Weak default: reserve nothing; get_offset() adds this to the size. */
+ 	return 0;
+ }
+ 
+ /* Reserve room for @sechdr at the end of *@size and return its offset. */
+ static long get_offset(struct module *mod, unsigned int *size,
+ 		       Elf_Shdr *sechdr, unsigned int section)
+ {
+ 	unsigned int cursor = *size + arch_mod_section_prepend(mod, section);
+ 	/* A zero sh_addralign is treated as byte alignment. */
+ 	long offset = ALIGN(cursor, sechdr->sh_addralign ?: 1);
+ 
+ 	*size = offset + sechdr->sh_size;
+ 	return offset;
+ }
+ 
+ /* Lay out the SHF_ALLOC sections roughly as ld would: code, read-only
+    data, ro-after-init data, read-write data, small data.  Sizes are
+    tallied in core_layout/init_layout and each offset is stored in
+    sh_entsize, with INIT_OFFSET_MASK set when the section is init. */
+ static void layout_sections(struct module *mod, struct load_info *info)
+ {
+ 	static unsigned long const masks[][2] = {
+ 		/* Each row: { flags required, flags forbidden }.  NOTE:
+ 		 * executable code must remain the first row; otherwise
+ 		 * modify the text_size finder in the two loops below */
+ 		{ SHF_EXECINSTR | SHF_ALLOC, ARCH_SHF_SMALL },
+ 		{ SHF_ALLOC, SHF_WRITE | ARCH_SHF_SMALL },
+ 		{ SHF_RO_AFTER_INIT | SHF_ALLOC, ARCH_SHF_SMALL },
+ 		{ SHF_WRITE | SHF_ALLOC, ARCH_SHF_SMALL },
+ 		{ ARCH_SHF_SMALL | SHF_ALLOC, 0 }
+ 	};
+ 	unsigned int m, i;
+ 
+ 	for (i = 0; i < info->hdr->e_shnum; i++)
+ 		info->sechdrs[i].sh_entsize = ~0UL;	/* ~0UL: not placed yet */
+ 
+ 	pr_debug("Core section allocation order:\n");
+ 	for (m = 0; m < ARRAY_SIZE(masks); ++m) {
+ 		for (i = 0; i < info->hdr->e_shnum; ++i) {
+ 			Elf_Shdr *s = &info->sechdrs[i];
+ 			const char *sname = info->secstrings + s->sh_name;
+ 
+ 			if ((s->sh_flags & masks[m][0]) != masks[m][0]
+ 			    || (s->sh_flags & masks[m][1])
+ 			    || s->sh_entsize != ~0UL
+ 			    || strstarts(sname, ".init"))	/* left for the init pass */
+ 				continue;
+ 			s->sh_entsize = get_offset(mod, &mod->core_layout.size, s, i);
+ 			pr_debug("\t%s\n", sname);
+ 		}
+ 		switch (m) {
+ 		case 0: /* executable */
+ 			mod->core_layout.size = debug_align(mod->core_layout.size);
+ 			mod->core_layout.text_size = mod->core_layout.size;
+ 			break;
+ 		case 1: /* RO: text and ro-data */
+ 			mod->core_layout.size = debug_align(mod->core_layout.size);
+ 			mod->core_layout.ro_size = mod->core_layout.size;
+ 			break;
+ 		case 2: /* RO after init */
+ 			mod->core_layout.size = debug_align(mod->core_layout.size);
+ 			mod->core_layout.ro_after_init_size = mod->core_layout.size;
+ 			break;
+ 		case 4: /* whole core */
+ 			mod->core_layout.size = debug_align(mod->core_layout.size);
+ 			break;
+ 		}
+ 	}
+ 
+ 	pr_debug("Init section allocation order:\n");
+ 	for (m = 0; m < ARRAY_SIZE(masks); ++m) {
+ 		for (i = 0; i < info->hdr->e_shnum; ++i) {
+ 			Elf_Shdr *s = &info->sechdrs[i];
+ 			const char *sname = info->secstrings + s->sh_name;
+ 
+ 			if ((s->sh_flags & masks[m][0]) != masks[m][0]
+ 			    || (s->sh_flags & masks[m][1])
+ 			    || s->sh_entsize != ~0UL
+ 			    || !strstarts(sname, ".init"))	/* only ".init" names here */
+ 				continue;
+ 			s->sh_entsize = (get_offset(mod, &mod->init_layout.size, s, i)
+ 					 | INIT_OFFSET_MASK);
+ 			pr_debug("\t%s\n", sname);
+ 		}
+ 		switch (m) {
+ 		case 0: /* executable */
+ 			mod->init_layout.size = debug_align(mod->init_layout.size);
+ 			mod->init_layout.text_size = mod->init_layout.size;
+ 			break;
+ 		case 1: /* RO: text and ro-data */
+ 			mod->init_layout.size = debug_align(mod->init_layout.size);
+ 			mod->init_layout.ro_size = mod->init_layout.size;
+ 			break;
+ 		case 2:
+ 			/*
+ 			 * RO after init doesn't apply to init_layout (only
+ 			 * core_layout), so it just takes the value of ro_size.
+ 			 */
+ 			mod->init_layout.ro_after_init_size = mod->init_layout.ro_size;
+ 			break;
+ 		case 4: /* whole init */
+ 			mod->init_layout.size = debug_align(mod->init_layout.size);
+ 			break;
+ 		}
+ 	}
+ }
+ 
+ static void set_license(struct module *mod, const char *license)
+ {
+ 	const char *lic = license ? license : "unspecified";
+ 
+ 	/* A GPL-compatible license never taints: nothing more to do. */
+ 	if (license_is_gpl_compatible(lic))
+ 		return;
+ 
+ 	if (!test_taint(TAINT_PROPRIETARY_MODULE))
+ 		pr_warn("%s: module license '%s' taints kernel.\n",
+ 			mod->name, lic);
+ 	add_taint_module(mod, TAINT_PROPRIETARY_MODULE, LOCKDEP_NOW_UNRELIABLE);
+ }
+ 
+ /* Step to the next NUL-separated .modinfo entry; NULL once the section ends */
+ static char *next_string(char *string, unsigned long *secsize)
+ {
+ 	char *cur;
+ 
+ 	/* Consume the remainder of the current entry. */
+ 	for (cur = string; *cur != '\0'; cur++) {
+ 		if ((*secsize)-- <= 1)
+ 			return NULL;
+ 	}
+ 
+ 	/* Then consume its terminator and any zero padding that follows. */
+ 	for (; *cur == '\0'; cur++) {
+ 		if ((*secsize)-- <= 1)
+ 			return NULL;
+ 	}
+ 	return cur;
+ }
+ 
+ static char *get_next_modinfo(const struct load_info *info, const char *tag,
+ 			      char *prev)
+ {
+ 	const Elf_Shdr *sec = &info->sechdrs[info->index.info];
+ 	const unsigned int taglen = strlen(tag);
+ 	unsigned long remaining = sec->sh_size;
+ 	/*
+ 	 * Address .modinfo via sh_offset, not sh_addr: get_modinfo() may be
+ 	 * called before rewrite_section_headers() has set sh_addr.
+ 	 */
+ 	char *entry = (char *)info->hdr + sec->sh_offset;
+ 
+ 	if (prev) {
+ 		/* Resume at the entry following @prev. */
+ 		remaining -= prev - entry;
+ 		entry = next_string(prev, &remaining);
+ 	}
+ 
+ 	while (entry) {
+ 		if (!strncmp(entry, tag, taglen) && entry[taglen] == '=')
+ 			return entry + taglen + 1;
+ 		entry = next_string(entry, &remaining);
+ 	}
+ 	return NULL;
+ }
+ 
+ static char *get_modinfo(const struct load_info *info, const char *tag)
+ {
+ 	return get_next_modinfo(info, tag, NULL);	/* NULL prev: search from the start */
+ }
+ 
+ static void setup_modinfo(struct module *mod, struct load_info *info)
+ {
+ 	struct module_attribute *entry;
+ 	int idx = 0;
+ 
+ 	/* Walk the NULL-terminated table; the setup hook is optional. */
+ 	while ((entry = modinfo_attrs[idx++]) != NULL)
+ 		if (entry->setup)
+ 			entry->setup(mod, get_modinfo(info, entry->attr.name));
+ }
+ 
+ static void free_modinfo(struct module *mod)
+ {
+ 	struct module_attribute *entry;
+ 	int idx = 0;
+ 
+ 	/* Walk the NULL-terminated table; the free hook is optional. */
+ 	while ((entry = modinfo_attrs[idx++]) != NULL)
+ 		if (entry->free)
+ 			entry->free(mod);
+ }
+ 
+ #ifdef CONFIG_KALLSYMS
+ 
+ /* Binary-search the kernel_symbol range [start, stop) for @name */
+ static const struct kernel_symbol *lookup_exported_symbol(const char *name,
+ 							  const struct kernel_symbol *start,
+ 							  const struct kernel_symbol *stop)
+ {
+ 	const size_t count = stop - start;
+ 	return bsearch(name, start, count, sizeof(*start), cmp_name);
+ }
+ 
+ static int is_exported(const char *name, unsigned long value,
+ 		       const struct module *mod)
+ {
+ 	/* With no module given, search __start___ksymtab..__stop___ksymtab. */
+ 	const struct kernel_symbol *first = mod ? mod->syms : __start___ksymtab;
+ 	const struct kernel_symbol *last = mod ? first + mod->num_syms
+ 					       : __stop___ksymtab;
+ 	const struct kernel_symbol *hit = lookup_exported_symbol(name, first, last);
+ 
+ 	return hit && kernel_symbol_value(hit) == value;
+ }
+ 
+ /* Classify @sym with a one-letter type code, as per nm */
+ static char elf_type(const Elf_Sym *sym, const struct load_info *info)
+ {
+ 	const unsigned int shndx = sym->st_shndx;
+ 	const Elf_Shdr *sec;
+ 
+ 	/* Weak binding is reported first: 'v' for objects, 'w' otherwise. */
+ 	if (ELF_ST_BIND(sym->st_info) == STB_WEAK)
+ 		return ELF_ST_TYPE(sym->st_info) == STT_OBJECT ? 'v' : 'w';
+ 
+ 	/* Symbols that are classified by their section index alone. */
+ 	if (shndx == SHN_UNDEF)
+ 		return 'U';
+ 	if (shndx == SHN_ABS || shndx == info->index.pcpu)
+ 		return 'a';
+ 	if (shndx >= SHN_LORESERVE)
+ 		return '?';
+ 
+ 	/* From here on the answer depends on the section's header. */
+ 	sec = &info->sechdrs[shndx];
+ 	if (sec->sh_flags & SHF_EXECINSTR)
+ 		return 't';
+ 
+ 	/* Allocated sections with contents: read-only, small or plain data. */
+ 	if ((sec->sh_flags & SHF_ALLOC) && sec->sh_type != SHT_NOBITS) {
+ 		if (!(sec->sh_flags & SHF_WRITE))
+ 			return 'r';
+ 		return (sec->sh_flags & ARCH_SHF_SMALL) ? 'g' : 'd';
+ 	}
+ 
+ 	/* SHT_NOBITS sections: small or plain. */
+ 	if (sec->sh_type == SHT_NOBITS)
+ 		return (sec->sh_flags & ARCH_SHF_SMALL) ? 's' : 'b';
+ 
+ 	/* Anything left is only recognised by a ".debug" section name. */
+ 	if (strstarts(info->secstrings + sec->sh_name, ".debug"))
+ 		return 'n';
+ 
+ 	return '?';
+ }
+ 
+ static bool is_core_symbol(const Elf_Sym *src, const Elf_Shdr *sechdrs,
+ 			unsigned int shnum, unsigned int pcpundx)
+ {
+ 	const Elf_Shdr *sec;
+ 	/* Decide whether @src is kept in the cut-down core symbol table. */
+ 	if (src->st_shndx == SHN_UNDEF
+ 	    || src->st_shndx >= shnum
+ 	    || !src->st_name)
+ 		return false;	/* undefined, out of range, or unnamed */
+ 
+ #ifdef CONFIG_KALLSYMS_ALL
+ 	if (src->st_shndx == pcpundx)
+ 		return true;	/* per-cpu symbols are kept in this configuration */
+ #endif
+ 
+ 	sec = sechdrs + src->st_shndx;
+ 	if (!(sec->sh_flags & SHF_ALLOC)
+ #ifndef CONFIG_KALLSYMS_ALL
+ 	    || !(sec->sh_flags & SHF_EXECINSTR)	/* text only without KALLSYMS_ALL */
+ #endif
+ 	    || (sec->sh_entsize & INIT_OFFSET_MASK))	/* laid out in the init part */
+ 		return false;
+ 
+ 	return true;
+ }
+ 
+ /*
+  * Reserve room for the full symtab/strtab at the end of the init part
+  * and for the cut-down core copies at the end of the core part.  Only
+  * strings of kept symbols are counted: simple, but duplicates get
+  * copied more than once.  We could be more sophisticated, see
+  * linux-kernel thread starting with <73defb5e4bca04a6431392cc341112b1@localhost>.
+  */
+ static void layout_symtab(struct module *mod, struct load_info *info)
+ {
+ 	Elf_Shdr *symsect = info->sechdrs + info->index.sym;
+ 	Elf_Shdr *strsect = info->sechdrs + info->index.str;
+ 	const Elf_Sym *src;
+ 	unsigned int i, nsrc, ndst, strtab_size = 0;
+ 
+ 	/* Put symbol section at end of init part of module. */
+ 	symsect->sh_flags |= SHF_ALLOC;
+ 	symsect->sh_entsize = get_offset(mod, &mod->init_layout.size, symsect,
+ 					 info->index.sym) | INIT_OFFSET_MASK;
+ 	pr_debug("\t%s\n", info->secstrings + symsect->sh_name);
+ 
+ 	src = (void *)info->hdr + symsect->sh_offset;
+ 	nsrc = symsect->sh_size / sizeof(*src);
+ 
+ 	/* Compute total space required for the core symbols' strtab. */
+ 	for (ndst = i = 0; i < nsrc; i++) {
+ 		if (i == 0 || is_livepatch_module(mod) ||
+ 		    is_core_symbol(src+i, info->sechdrs, info->hdr->e_shnum,
+ 				   info->index.pcpu)) {
+ 			strtab_size += strlen(&info->strtab[src[i].st_name])+1;
+ 			ndst++;
+ 		}
+ 	}
+ 
+ 	/* Append room for core symbols at end of core part. */
+ 	info->symoffs = ALIGN(mod->core_layout.size, symsect->sh_addralign ?: 1);
+ 	info->stroffs = mod->core_layout.size = info->symoffs + ndst * sizeof(Elf_Sym);
+ 	mod->core_layout.size += strtab_size;
+ 	info->core_typeoffs = mod->core_layout.size;
+ 	mod->core_layout.size += ndst * sizeof(char);	/* one type char per kept symbol */
+ 	mod->core_layout.size = debug_align(mod->core_layout.size);
+ 
+ 	/* Put string table section at end of init part of module. */
+ 	strsect->sh_flags |= SHF_ALLOC;
+ 	strsect->sh_entsize = get_offset(mod, &mod->init_layout.size, strsect,
+ 					 info->index.str) | INIT_OFFSET_MASK;
+ 	pr_debug("\t%s\n", info->secstrings + strsect->sh_name);
+ 
+ 	/* We'll tack temporary mod_kallsyms on the end. */
+ 	mod->init_layout.size = ALIGN(mod->init_layout.size,
+ 				      __alignof__(struct mod_kallsyms));
+ 	info->mod_kallsyms_init_off = mod->init_layout.size;
+ 	mod->init_layout.size += sizeof(struct mod_kallsyms);
+ 	info->init_typeoffs = mod->init_layout.size;
+ 	mod->init_layout.size += nsrc * sizeof(char);	/* one type char per symbol */
+ 	mod->init_layout.size = debug_align(mod->init_layout.size);
+ }
+ 
+ /*
+  * Point mod->kallsyms at the full symtab and strtab which layout_symtab
+  * arranged to be appended to the init section, and fill in the cut-down
+  * core-only tables (mod->core_kallsyms) that are switched to later.
+  */
+ static void add_kallsyms(struct module *mod, const struct load_info *info)
+ {
+ 	unsigned int i, ndst;
+ 	const Elf_Sym *src;
+ 	Elf_Sym *dst;
+ 	char *s;
+ 	Elf_Shdr *symsec = &info->sechdrs[info->index.sym];
+ 
+ 	/* Set up to point into init section. */
+ 	mod->kallsyms = mod->init_layout.base + info->mod_kallsyms_init_off;
+ 
+ 	mod->kallsyms->symtab = (void *)symsec->sh_addr;
+ 	mod->kallsyms->num_symtab = symsec->sh_size / sizeof(Elf_Sym);
+ 	/* Make sure we get permanent strtab: don't use info->strtab. */
+ 	mod->kallsyms->strtab = (void *)info->sechdrs[info->index.str].sh_addr;
+ 	mod->kallsyms->typetab = mod->init_layout.base + info->init_typeoffs;
+ 
+ 	/*
+ 	 * Now populate the cut down core kallsyms for after init
+ 	 * and set types up while we still have access to sections.
+ 	 */
+ 	mod->core_kallsyms.symtab = dst = mod->core_layout.base + info->symoffs;
+ 	mod->core_kallsyms.strtab = s = mod->core_layout.base + info->stroffs;
+ 	mod->core_kallsyms.typetab = mod->core_layout.base + info->core_typeoffs;
+ 	src = mod->kallsyms->symtab;
+ 	for (ndst = i = 0; i < mod->kallsyms->num_symtab; i++) {
+ 		mod->kallsyms->typetab[i] = elf_type(src + i, info);
+ 		if (i == 0 || is_livepatch_module(mod) ||
+ 		    is_core_symbol(src+i, info->sechdrs, info->hdr->e_shnum,
+ 				   info->index.pcpu)) {
+ 			mod->core_kallsyms.typetab[ndst] =
+ 			    mod->kallsyms->typetab[i];
+ 			dst[ndst] = src[i];
+ 			dst[ndst++].st_name = s - mod->core_kallsyms.strtab;	/* offset in core strtab */
+ 			s += strlcpy(s, &mod->kallsyms->strtab[src[i].st_name],
+ 				     KSYM_NAME_LEN) + 1;	/* +1 steps over the NUL */
+ 		}
+ 	}
+ 	mod->core_kallsyms.num_symtab = ndst;
+ }
+ #else
+ static inline void layout_symtab(struct module *mod, struct load_info *info)
+ {	/* !CONFIG_KALLSYMS: no symbol table space to lay out */
+ }
+ 
+ static void add_kallsyms(struct module *mod, const struct load_info *info)
+ {	/* !CONFIG_KALLSYMS: nothing to record */
+ }
+ #endif /* CONFIG_KALLSYMS */
+ 
+ static void dynamic_debug_setup(struct module *mod, struct _ddebug *debug, unsigned int num)
+ {
+ 	/* A NULL table means there is nothing to hand to dynamic debug. */
+ 	if (debug)
+ 		ddebug_add_module(debug, num, mod->name);
+ }
+ 
+ static void dynamic_debug_remove(struct module *mod, struct _ddebug *debug)
+ {
+ 	if (debug)	/* same guard as dynamic_debug_setup(): no table, nothing added */
+ 		ddebug_remove_module(mod->name);
+ }
+ 
+ void * __weak module_alloc(unsigned long size)
+ {
+ 	return vmalloc_exec(size);	/* weak default, backed by vmalloc_exec() */
+ }
+ 
+ bool __weak module_exit_section(const char *name)
+ {
+ 	return strstarts(name, ".exit");	/* weak default: match on the ".exit" prefix */
+ }
+ 
+ #ifdef CONFIG_DEBUG_KMEMLEAK
+ static void kmemleak_load_module(const struct module *mod,
+ 				 const struct load_info *info)
+ {
+ 	const unsigned long data_mask = SHF_ALLOC | SHF_WRITE | SHF_EXECINSTR;
+ 	unsigned int idx;
+ 
+ 	/* Register struct module itself, then only sections holding data. */
+ 	kmemleak_scan_area(mod, sizeof(struct module), GFP_KERNEL);
+ 
+ 	for (idx = 1; idx < info->hdr->e_shnum; idx++) {
+ 		const Elf_Shdr *sec = &info->sechdrs[idx];
+ 
+ 		/* Data here means allocated and writable, but not executable. */
+ 		if ((sec->sh_flags & data_mask) != (SHF_ALLOC | SHF_WRITE))
+ 			continue;
+ 		kmemleak_scan_area((void *)sec->sh_addr, sec->sh_size,
+ 				   GFP_KERNEL);
+ 	}
+ }
+ #else
+ static inline void kmemleak_load_module(const struct module *mod,
+ 					const struct load_info *info)
+ {
+ }
+ #endif
+ 
+ #ifdef CONFIG_MODULE_SIG
+ static int module_sig_check(struct load_info *info, int flags)
+ {
+ 	int err = -ENODATA;	/* stays -ENODATA when no signature marker is found */
+ 	const unsigned long markerlen = sizeof(MODULE_SIG_STRING) - 1;
+ 	const char *reason;
+ 	const void *mod = info->hdr;
+ 
+ 	/*
+ 	 * Require flags == 0, as a module with version information
+ 	 * removed is no longer the module that was signed
+ 	 */
+ 	if (flags == 0 &&
+ 	    info->len > markerlen &&
+ 	    memcmp(mod + info->len - markerlen, MODULE_SIG_STRING, markerlen) == 0) {
+ 		/* Drop the trailing marker string from the module length */
+ 		info->len -= markerlen;
+ 		err = mod_verify_sig(mod, info);
+ 	}
+ 
+ 	switch (err) {
+ 	case 0:
+ 		info->sig_ok = true;
+ 		return 0;
+ 
+ 		/* We don't permit modules to be loaded into trusted kernels
+ 		 * without a valid signature on them, but if we're not
+ 		 * enforcing, certain errors are non-fatal.
+ 		 */
+ 	case -ENODATA:
+ 		reason = "unsigned module";
+ 		break;
+ 	case -ENOPKG:
+ 		reason = "module with unsupported crypto";
+ 		break;
+ 	case -ENOKEY:
+ 		reason = "module with unavailable key";
+ 		break;
+ 
+ 		/* All other errors are fatal, including nomem, unparseable
+ 		 * signatures and signature check failures - even if signatures
+ 		 * aren't required.
+ 		 */
+ 	default:
+ 		return err;
+ 	}
+ 
+ 	if (is_module_sig_enforced()) {
+ 		pr_notice("Loading of %s is rejected\n", reason);
+ 		return -EKEYREJECTED;
+ 	}
+ 
+ 	return security_locked_down(LOCKDOWN_MODULE_SIGNATURE);	/* not enforcing */
+ }
+ #else /* !CONFIG_MODULE_SIG */
+ static int module_sig_check(struct load_info *info, int flags)
+ {
+ 	return 0;	/* signature checking is compiled out */
+ }
+ #endif /* !CONFIG_MODULE_SIG */
+ 
+ static int validate_section_offset(struct load_info *info, Elf_Shdr *shdr)
+ {
+ 	const unsigned long start = shdr->sh_offset;
+ 	const unsigned long end = start + shdr->sh_size;
+ 
+ 	/* A sum smaller than its first operand has wrapped around. */
+ 	if (end < start)
+ 		return -ENOEXEC;
+ 
+ 	/* The section must also end inside the loaded image. */
+ 	if (end > info->len)
+ 		return -ENOEXEC;
+ 	return 0;
+ }
+ 
+ /*
+  * Sanity checks against invalid binaries, wrong arch, weird elf version.
+  *
+  * Also do basic validity checks against section offsets and sizes, the
+  * section name string table, and the indices used for it (sh_name).
+  * Sets info->sechdrs and info->secstrings; returns 0 or a negative errno.
+  */
+ static int elf_validity_check(struct load_info *info)
+ {
+ 	unsigned int i;
+ 	Elf_Shdr *shdr, *strhdr;
+ 	int err;
+ 
+ 	if (info->len < sizeof(*(info->hdr)))
+ 		return -ENOEXEC;
+ 
+ 	if (memcmp(info->hdr->e_ident, ELFMAG, SELFMAG) != 0
+ 	    || info->hdr->e_type != ET_REL
+ 	    || !elf_check_arch(info->hdr)
+ 	    || info->hdr->e_shentsize != sizeof(Elf_Shdr))
+ 		return -ENOEXEC;
+ 
+ 	/*
+ 	 * e_shnum is 16 bits, and sizeof(Elf_Shdr) is known and small. So
+ 	 * e_shnum * sizeof(Elf_Shdr) will not overflow unsigned long.
+ 	 */
+ 	if (info->hdr->e_shoff >= info->len
+ 	    || (info->hdr->e_shnum * sizeof(Elf_Shdr) >
+ 		info->len - info->hdr->e_shoff))
+ 		return -ENOEXEC;
+ 
+ 	info->sechdrs = (void *)info->hdr + info->hdr->e_shoff;
+ 
+ 	/* Verify that the section name table index is valid. */
+ 	if (info->hdr->e_shstrndx == SHN_UNDEF
+ 	    || info->hdr->e_shstrndx >= info->hdr->e_shnum)
+ 		return -ENOEXEC;
+ 
+ 	strhdr = &info->sechdrs[info->hdr->e_shstrndx];
+ 	err = validate_section_offset(info, strhdr);
+ 	if (err < 0)
+ 		return err;
+ 
+ 	/*
+ 	 * The section name table must be NUL-terminated, as required by the
+ 	 * spec. This makes strcmp and pr_* calls that access strings in the
+ 	 * section safe. An empty table is rejected first, because
+ 	 * sh_size - 1 would otherwise wrap and index far outside the image.
+ 	 */
+ 	info->secstrings = (void *)info->hdr + strhdr->sh_offset;
+ 	if (strhdr->sh_size == 0
+ 	    || info->secstrings[strhdr->sh_size - 1] != '\0')
+ 		return -ENOEXEC;
+ 
+ 	/*
+ 	 * The code assumes that section 0 has a length of zero and
+ 	 * an addr of zero, so check for it.
+ 	 */
+ 	if (info->sechdrs[0].sh_type != SHT_NULL
+ 	    || info->sechdrs[0].sh_size != 0
+ 	    || info->sechdrs[0].sh_addr != 0)
+ 		return -ENOEXEC;
+ 
+ 	for (i = 1; i < info->hdr->e_shnum; i++) {
+ 		shdr = &info->sechdrs[i];
+ 		switch (shdr->sh_type) {
+ 		case SHT_NULL:
+ 		case SHT_NOBITS:
+ 			continue;
+ 		case SHT_SYMTAB:
+ 			if (shdr->sh_link == SHN_UNDEF
+ 			    || shdr->sh_link >= info->hdr->e_shnum)
+ 				return -ENOEXEC;
+ 			fallthrough;
+ 		default:
+ 			err = validate_section_offset(info, shdr);
+ 			if (err < 0) {
+ 				pr_err("Invalid ELF section in module (section %u type %u)\n",
+ 					i, shdr->sh_type);
+ 				return err;
+ 			}
+ 
+ 			if (shdr->sh_flags & SHF_ALLOC) {
+ 				if (shdr->sh_name >= strhdr->sh_size) {
+ 					pr_err("Invalid ELF section name in module (section %u type %u)\n",
+ 					       i, shdr->sh_type);
+ 					return -ENOEXEC;
+ 				}
+ 			}
+ 			break;
+ 		}
+ 	}
+ 
+ 	return 0;
+ }
+ 
+ #define COPY_CHUNK_SIZE (16*PAGE_SIZE)
+ 
+ static int copy_chunked_from_user(void *dst, const void __user *usrc, unsigned long len)
+ {
+ 	unsigned long chunk;
+ 
+ 	do {	/* bounded pieces, with a reschedule point after each */
+ 		chunk = min(len, COPY_CHUNK_SIZE);
+ 		if (copy_from_user(dst, usrc, chunk))
+ 			return -EFAULT;
+ 		cond_resched();
+ 		dst += chunk;
+ 		usrc += chunk;
+ 	} while ((len -= chunk) != 0);
+ 	return 0;
+ }
+ 
+ #ifdef CONFIG_LIVEPATCH
+ /* A "livepatch" modinfo tag sets mod->klp and taints; always returns 0. */
+ static int check_modinfo_livepatch(struct module *mod, struct load_info *info)
+ {
+ 	if (get_modinfo(info, "livepatch")) {
+ 		mod->klp = true;
+ 		add_taint_module(mod, TAINT_LIVEPATCH, LOCKDEP_STILL_OK);
+ 		pr_notice_once("%s: tainting kernel with TAINT_LIVEPATCH\n",
+ 			       mod->name);
+ 	}
+ 	return 0;
+ }
+ #else /* !CONFIG_LIVEPATCH */
+ /* Without livepatch support a tagged module is refused with -ENOEXEC. */
+ static int check_modinfo_livepatch(struct module *mod, struct load_info *info)
+ {
+ 	if (get_modinfo(info, "livepatch")) {
+ 		pr_err("%s: module is marked as livepatch module, but livepatch support is disabled\n",
+ 		       mod->name);
+ 		return -ENOEXEC;
+ 	}
+ 	return 0;
+ }
+ #endif /* CONFIG_LIVEPATCH */
+ 
+ static void check_modinfo_retpoline(struct module *mod, struct load_info *info)
+ {
+ 	/* Purely advisory: when retpoline_module_ok() objects to the
+ 	   "retpoline" modinfo value, only a warning is logged. */
+ 	if (!retpoline_module_ok(get_modinfo(info, "retpoline")))
+ 		pr_warn("%s: loading module not compiled with retpoline compiler.\n",
+ 			mod->name);
+ }
+ 
+ /* Copy the user image into a fresh vmalloc buffer; sets info->hdr and info->len. */
+ static int copy_module_from_user(const void __user *umod, unsigned long len,
+ 				  struct load_info *info)
+ {
+ 	void *image;
+ 	int rc;
+ 
+ 	/* Anything shorter than an ELF header cannot be a module. */
+ 	info->len = len;
+ 	if (len < sizeof(*(info->hdr)))
+ 		return -ENOEXEC;
+ 
+ 	rc = security_kernel_load_data(LOADING_MODULE);
+ 	if (rc)
+ 		return rc;
+ 
+ 	/* Pull in the entire file: most of it will be wanted. */
+ 	image = __vmalloc(len, GFP_KERNEL | __GFP_NOWARN, PAGE_KERNEL);
+ 	info->hdr = image;
+ 	if (!image)
+ 		return -ENOMEM;
+ 	if (copy_chunked_from_user(image, umod, len)) {
+ 		vfree(image);
+ 		return -EFAULT;
+ 	}
+ 	return 0;
+ }
+ 
+ static void free_copy(struct load_info *info)
+ {
+ 	vfree(info->hdr);	/* release the image copy held in info->hdr */
+ }
+ 
+ static int rewrite_section_headers(struct load_info *info, int flags)
+ {
+ 	const unsigned long keep = ~(unsigned long)SHF_ALLOC;
+ 	Elf_Shdr *hdrs = info->sechdrs;
+ 	unsigned int idx;
+ 
+ 	/* Section 0 should already have a zero address; enforce it anyway. */
+ 	hdrs[0].sh_addr = 0;
+ 
+ 	for (idx = 1; idx < info->hdr->e_shnum; idx++) {
+ 		/* Point sh_addr at the section's bytes inside the
+ 		   temporary image. */
+ 		hdrs[idx].sh_addr = (size_t)info->hdr + hdrs[idx].sh_offset;
+ 
+ #ifndef CONFIG_MODULE_UNLOAD
+ 		/* Without unload support, .exit sections are not loaded. */
+ 		if (module_exit_section(info->secstrings + hdrs[idx].sh_name))
+ 			hdrs[idx].sh_flags &= keep;
+ #endif
+ 	}
+ 
+ 	/* The modinfo and version sections are tracked, but not kept. */
+ 	hdrs[info->index.vers].sh_flags &= keep;
+ 	hdrs[info->index.info].sh_flags &= keep;
+ 
+ 	return 0;
+ }
+ 
+ /*
+  * Set up our basic convenience variables (section indices in info->index,
+  * info->strtab, info->name), and do some basic section verification:
+  * a module without a symbol table or this_module section gets -ENOEXEC.
+  *
+  * Set info->mod to the temporary copy of the module in info->hdr. The final one
+  * will be allocated in move_module().
+  */
+ static int setup_load_info(struct load_info *info, int flags)
+ {
+ 	unsigned int i;
+ 
+ 	/* Try to find a name early so we can log errors with a module name */
+ 	info->index.info = find_sec(info, ".modinfo");
+ 	if (info->index.info)
+ 		info->name = get_modinfo(info, "name");
+ 
+ 	/* Find the first SHT_SYMTAB section and the string table it links to. */
+ 	for (i = 1; i < info->hdr->e_shnum; i++) {
+ 		if (info->sechdrs[i].sh_type == SHT_SYMTAB) {
+ 			info->index.sym = i;
+ 			info->index.str = info->sechdrs[i].sh_link;
+ 			info->strtab = (char *)info->hdr
+ 				+ info->sechdrs[info->index.str].sh_offset;
+ 			break;
+ 		}
+ 	}
+ 
+ 	/* NOTE(review): assumes the caller zero-initialised info->index.sym */
+ 	if (info->index.sym == 0) {
+ 		pr_warn("%s: module has no symbols (stripped?)\n",
+ 			info->name ?: "(missing .modinfo section or name field)");
+ 		return -ENOEXEC;
+ 	}
+ 
+ 	info->index.mod = find_sec(info, ".gnu.linkonce.this_module");
+ 	if (!info->index.mod) {
+ 		pr_warn("%s: No module found in object\n",
+ 			info->name ?: "(missing .modinfo section or name field)");
+ 		return -ENOEXEC;
+ 	}
+ 	/* This is temporary: point mod into copy of data. */
+ 	info->mod = (void *)info->hdr + info->sechdrs[info->index.mod].sh_offset;
+ 
+ 	/*
+ 	 * If we didn't load the .modinfo 'name' field earlier, fall back to
+ 	 * on-disk struct mod 'name' field.
+ 	 */
+ 	if (!info->name)
+ 		info->name = info->mod->name;
+ 
+ 	if (flags & MODULE_INIT_IGNORE_MODVERSIONS)
+ 		info->index.vers = 0; /* Pretend no __versions section! */
+ 	else
+ 		info->index.vers = find_sec(info, "__versions");
+ 
+ 	info->index.pcpu = find_pcpusec(info);
+ 
+ 	return 0;
+ }
+ 
+ static int check_modinfo(struct module *mod, struct load_info *info, int flags)
+ {
+ 	const char *modmagic = get_modinfo(info, "vermagic");
+ 	int err;
+ 
+ 	if (flags & MODULE_INIT_IGNORE_VERMAGIC)
+ 		modmagic = NULL;	/* handled exactly like a missing vermagic */
+ 
+ 	/* This is allowed: modprobe --force will invalidate it. */
+ 	if (!modmagic) {
+ 		err = try_to_force_load(mod, "bad vermagic");
+ 		if (err)
+ 			return err;
+ 	} else if (!same_magic(modmagic, vermagic, info->index.vers)) {
+ 		pr_err("%s: version magic '%s' should be '%s'\n",
+ 		       info->name, modmagic, vermagic);
+ 		return -ENOEXEC;
+ 	}
+ 
+ 	if (!get_modinfo(info, "intree")) {	/* no "intree" tag: out-of-tree */
+ 		if (!test_taint(TAINT_OOT_MODULE))
+ 			pr_warn("%s: loading out-of-tree module taints kernel.\n",
+ 				mod->name);
+ 		add_taint_module(mod, TAINT_OOT_MODULE, LOCKDEP_STILL_OK);
+ 	}
+ 
+ 	check_modinfo_retpoline(mod, info);
+ 
+ 	if (get_modinfo(info, "staging")) {	/* staging modules taint as well */
+ 		add_taint_module(mod, TAINT_CRAP, LOCKDEP_STILL_OK);
+ 		pr_warn("%s: module is from the staging directory, the quality "
+ 			"is unknown, you have been warned.\n", mod->name);
+ 	}
+ 
+ 	err = check_modinfo_livepatch(mod, info);
+ 	if (err)
+ 		return err;
+ 
+ 	/* Set up license info based on the "license" tag of the info section */
+ 	set_license(mod, get_modinfo(info, "license"));
+ 
+ 	return 0;
+ }
+ 
+ static int find_module_sections(struct module *mod, struct load_info *info)
+ {	/* Look up the named sections and record them in @mod and @info. */
+ 	mod->kp = section_objs(info, "__param",
+ 			       sizeof(*mod->kp), &mod->num_kp);
+ 	mod->syms = section_objs(info, "__ksymtab",
+ 				 sizeof(*mod->syms), &mod->num_syms);
+ 	mod->crcs = section_addr(info, "__kcrctab");
+ 	mod->gpl_syms = section_objs(info, "__ksymtab_gpl",
+ 				     sizeof(*mod->gpl_syms),
+ 				     &mod->num_gpl_syms);
+ 	mod->gpl_crcs = section_addr(info, "__kcrctab_gpl");
+ 	mod->gpl_future_syms = section_objs(info,
+ 					    "__ksymtab_gpl_future",
+ 					    sizeof(*mod->gpl_future_syms),
+ 					    &mod->num_gpl_future_syms);
+ 	mod->gpl_future_crcs = section_addr(info, "__kcrctab_gpl_future");
+ 
+ #ifdef CONFIG_UNUSED_SYMBOLS
+ 	mod->unused_syms = section_objs(info, "__ksymtab_unused",
+ 					sizeof(*mod->unused_syms),
+ 					&mod->num_unused_syms);
+ 	mod->unused_crcs = section_addr(info, "__kcrctab_unused");
+ 	mod->unused_gpl_syms = section_objs(info, "__ksymtab_unused_gpl",
+ 					    sizeof(*mod->unused_gpl_syms),
+ 					    &mod->num_unused_gpl_syms);
+ 	mod->unused_gpl_crcs = section_addr(info, "__kcrctab_unused_gpl");
+ #endif
+ #ifdef CONFIG_CONSTRUCTORS
+ 	mod->ctors = section_objs(info, ".ctors",
+ 				  sizeof(*mod->ctors), &mod->num_ctors);
+ 	if (!mod->ctors)
+ 		mod->ctors = section_objs(info, ".init_array",
+ 				sizeof(*mod->ctors), &mod->num_ctors);
+ 	else if (find_sec(info, ".init_array")) {
+ 		/*
+ 		 * This shouldn't happen with same compiler and binutils
+ 		 * building all parts of the module.
+ 		 */
+ 		pr_warn("%s: has both .ctors and .init_array.\n",
+ 		       mod->name);
+ 		return -EINVAL;	/* both constructor tables present: refuse */
+ 	}
+ #endif
+ 
+ #ifdef CONFIG_TRACEPOINTS
+ 	mod->tracepoints_ptrs = section_objs(info, "__tracepoints_ptrs",
+ 					     sizeof(*mod->tracepoints_ptrs),
+ 					     &mod->num_tracepoints);
+ #endif
+ #ifdef CONFIG_TREE_SRCU
+ 	mod->srcu_struct_ptrs = section_objs(info, "___srcu_struct_ptrs",
+ 					     sizeof(*mod->srcu_struct_ptrs),
+ 					     &mod->num_srcu_structs);
+ #endif
+ #ifdef CONFIG_BPF_EVENTS
+ 	mod->bpf_raw_events = section_objs(info, "__bpf_raw_tp_map",
+ 					   sizeof(*mod->bpf_raw_events),
+ 					   &mod->num_bpf_raw_events);
+ #endif
+ #ifdef CONFIG_JUMP_LABEL
+ 	mod->jump_entries = section_objs(info, "__jump_table",
+ 					sizeof(*mod->jump_entries),
+ 					&mod->num_jump_entries);
+ #endif
+ #ifdef CONFIG_EVENT_TRACING
+ 	mod->trace_events = section_objs(info, "_ftrace_events",
+ 					 sizeof(*mod->trace_events),
+ 					 &mod->num_trace_events);
+ 	mod->trace_evals = section_objs(info, "_ftrace_eval_map",
+ 					sizeof(*mod->trace_evals),
+ 					&mod->num_trace_evals);
+ #endif
+ #ifdef CONFIG_TRACING
+ 	mod->trace_bprintk_fmt_start = section_objs(info, "__trace_printk_fmt",
+ 					 sizeof(*mod->trace_bprintk_fmt_start),
+ 					 &mod->num_trace_bprintk_fmt);
+ #endif
+ #ifdef CONFIG_FTRACE_MCOUNT_RECORD
+ 	/* sechdrs[0].sh_size is always zero */
+ 	mod->ftrace_callsites = section_objs(info, "__mcount_loc",
+ 					     sizeof(*mod->ftrace_callsites),
+ 					     &mod->num_ftrace_callsites);
+ #endif
+ #ifdef CONFIG_FUNCTION_ERROR_INJECTION
+ 	mod->ei_funcs = section_objs(info, "_error_injection_whitelist",
+ 					    sizeof(*mod->ei_funcs),
+ 					    &mod->num_ei_funcs);
+ #endif
+ 	mod->extable = section_objs(info, "__ex_table",
+ 				    sizeof(*mod->extable), &mod->num_exentries);
+ 
+ 	if (section_addr(info, "__obsparm"))	/* looked up only to warn */
+ 		pr_warn("%s: Ignoring obsolete parameters\n", mod->name);
+ 
+ 	info->debug = section_objs(info, "__verbose",
+ 				   sizeof(*info->debug), &info->num_debug);
+ 
+ 	return 0;
+ }
+ 
+ static int move_module(struct module *mod, struct load_info *info)
+ {
+ 	int i;
+ 	void *ptr;
+ 
+ 	/* Allocate the core region first; the init region follows if sized. */
+ 	ptr = module_alloc(mod->core_layout.size);
+ 	/*
+ 	 * The pointer to this block is stored in the module structure
+ 	 * which is inside the block. Just mark it as not being a
+ 	 * leak.
+ 	 */
+ 	kmemleak_not_leak(ptr);
+ 	if (!ptr)
+ 		return -ENOMEM;
+ 
+ 	memset(ptr, 0, mod->core_layout.size);
+ 	mod->core_layout.base = ptr;
+ 
+ 	if (mod->init_layout.size) {
+ 		ptr = module_alloc(mod->init_layout.size);
+ 		/*
+ 		 * The pointer to this block is stored in the module structure
+ 		 * which is inside the block. This block doesn't need to be
+ 		 * scanned as it contains data and code that will be freed
+ 		 * after the module is initialized.
+ 		 */
+ 		kmemleak_ignore(ptr);
+ 		if (!ptr) {
+ 			module_memfree(mod->core_layout.base);	/* undo the core allocation */
+ 			return -ENOMEM;
+ 		}
+ 		memset(ptr, 0, mod->init_layout.size);
+ 		mod->init_layout.base = ptr;
+ 	} else
+ 		mod->init_layout.base = NULL;
+ 
+ 	/* Transfer each section which specifies SHF_ALLOC */
+ 	pr_debug("final section addresses:\n");
+ 	for (i = 0; i < info->hdr->e_shnum; i++) {
+ 		void *dest;
+ 		Elf_Shdr *shdr = &info->sechdrs[i];
+ 
+ 		if (!(shdr->sh_flags & SHF_ALLOC))
+ 			continue;
+ 
+ 		if (shdr->sh_entsize & INIT_OFFSET_MASK)
+ 			dest = mod->init_layout.base
+ 				+ (shdr->sh_entsize & ~INIT_OFFSET_MASK);
+ 		else
+ 			dest = mod->core_layout.base + shdr->sh_entsize;
+ 
+ 		if (shdr->sh_type != SHT_NOBITS)	/* NOBITS keeps the zeroes from memset */
+ 			memcpy(dest, (void *)shdr->sh_addr, shdr->sh_size);
+ 		/* Update sh_addr to point to copy in image. */
+ 		shdr->sh_addr = (unsigned long)dest;
+ 		pr_debug("\t0x%lx %s\n",
+ 			 (long)shdr->sh_addr, info->secstrings + shdr->sh_name);
+ 	}
+ 
+ 	return 0;
+ }
+ 
+ static int check_module_license_and_versions(struct module *mod)
+ {
+ 	int prev_taint = test_taint(TAINT_PROPRIETARY_MODULE);	/* state before the name checks */
+ 
+ 	/*
+ 	 * ndiswrapper is under GPL by itself, but loads proprietary modules.
+ 	 * Don't use add_taint_module(), as it would prevent ndiswrapper from
+ 	 * using GPL-only symbols it needs.
+ 	 */
+ 	if (strcmp(mod->name, "ndiswrapper") == 0)
+ 		add_taint(TAINT_PROPRIETARY_MODULE, LOCKDEP_NOW_UNRELIABLE);
+ 
+ 	/* driverloader was caught wrongly pretending to be under GPL */
+ 	if (strcmp(mod->name, "driverloader") == 0)
+ 		add_taint_module(mod, TAINT_PROPRIETARY_MODULE,
+ 				 LOCKDEP_NOW_UNRELIABLE);
+ 
+ 	/* lve claims to be GPL but upstream won't provide source */
+ 	if (strcmp(mod->name, "lve") == 0)
+ 		add_taint_module(mod, TAINT_PROPRIETARY_MODULE,
+ 				 LOCKDEP_NOW_UNRELIABLE);
+ 
+ 	if (!prev_taint && test_taint(TAINT_PROPRIETARY_MODULE))	/* newly tainted here */
+ 		pr_warn("%s: module license taints kernel.\n", mod->name);
+ 
+ #ifdef CONFIG_MODVERSIONS
+ 	if ((mod->num_syms && !mod->crcs)
+ 	    || (mod->num_gpl_syms && !mod->gpl_crcs)
+ 	    || (mod->num_gpl_future_syms && !mod->gpl_future_crcs)
+ #ifdef CONFIG_UNUSED_SYMBOLS
+ 	    || (mod->num_unused_syms && !mod->unused_crcs)
+ 	    || (mod->num_unused_gpl_syms && !mod->unused_gpl_crcs)
+ #endif
+ 		) {
+ 		return try_to_force_load(mod,	/* exports present but no CRC table */
+ 					 "no versions for exported symbols");
+ 	}
+ #endif
+ 	return 0;
+ }
+ 
+ static void flush_module_icache(const struct module *mod)
+ {
+ 	mm_segment_t old_fs;
+ 
+ 	/* flush the icache in correct context */
+ 	old_fs = get_fs();
+ 	set_fs(KERNEL_DS);
+ 
+ 	/*
+ 	 * Flush the instruction cache, since we've played with text.
+ 	 * Do it before processing of module parameters, so the module
+ 	 * can provide parameter accessor functions of its own.
+ 	 */
+ 	if (mod->init_layout.base)
+ 		flush_icache_range((unsigned long)mod->init_layout.base,
+ 				   (unsigned long)mod->init_layout.base
+ 				   + mod->init_layout.size);
+ 	flush_icache_range((unsigned long)mod->core_layout.base,
+ 			   (unsigned long)mod->core_layout.base + mod->core_layout.size);
+ 
+ 	set_fs(old_fs);
+ }
+ 
+ int __weak module_frob_arch_sections(Elf_Ehdr *hdr,
+ 				     Elf_Shdr *sechdrs,
+ 				     char *secstrings,
+ 				     struct module *mod)
+ {
+ 	return 0;
+ }
+ 
+ /* module_blacklist is a comma-separated list of module names */
+ static char *module_blacklist;
+ static bool blacklisted(const char *module_name)
+ {
+ 	const char *p;
+ 	size_t len;
+ 
+ 	if (!module_blacklist)
+ 		return false;
+ 
+ 	for (p = module_blacklist; *p; p += len) {
+ 		len = strcspn(p, ",");
+ 		if (strlen(module_name) == len && !memcmp(module_name, p, len))
+ 			return true;
+ 		if (p[len] == ',')
+ 			len++;
+ 	}
+ 	return false;
+ }
+ core_param(module_blacklist, module_blacklist, charp, 0400);
+ 
+ static struct module *layout_and_allocate(struct load_info *info, int flags)
+ {
+ 	struct module *mod;
+ 	unsigned int ndx;
+ 	int err;
+ 
+ 	err = check_modinfo(info->mod, info, flags);
+ 	if (err)
+ 		return ERR_PTR(err);
+ 
+ 	/* Allow arches to frob section contents and sizes.  */
+ 	err = module_frob_arch_sections(info->hdr, info->sechdrs,
+ 					info->secstrings, info->mod);
+ 	if (err < 0)
+ 		return ERR_PTR(err);
+ 
+ 	/* We will do a special allocation for per-cpu sections later. */
+ 	info->sechdrs[info->index.pcpu].sh_flags &= ~(unsigned long)SHF_ALLOC;
+ 
+ 	/*
+ 	 * Mark ro_after_init section with SHF_RO_AFTER_INIT so that
+ 	 * layout_sections() can put it in the right place.
+ 	 * Note: ro_after_init sections also have SHF_{WRITE,ALLOC} set.
+ 	 */
+ 	ndx = find_sec(info, ".data..ro_after_init");
+ 	if (ndx)
+ 		info->sechdrs[ndx].sh_flags |= SHF_RO_AFTER_INIT;
+ 	/*
+ 	 * Mark the __jump_table section as ro_after_init as well: these data
+ 	 * structures are never modified, with the exception of entries that
+ 	 * refer to code in the __init section, which are annotated as such
+ 	 * at module load time.
+ 	 */
+ 	ndx = find_sec(info, "__jump_table");
+ 	if (ndx)
+ 		info->sechdrs[ndx].sh_flags |= SHF_RO_AFTER_INIT;
+ 
+ 	/* Determine total sizes, and put offsets in sh_entsize.  For now
+ 	   this is done generically; there don't appear to be any
+ 	   special cases for the architectures. */
+ 	layout_sections(info->mod, info);
+ 	layout_symtab(info->mod, info);
+ 
+ 	/* Allocate and move to the final place */
+ 	err = move_module(info->mod, info);
+ 	if (err)
+ 		return ERR_PTR(err);
+ 
+ 	/* Module has been copied to its final place now: return it. */
+ 	mod = (void *)info->sechdrs[info->index.mod].sh_addr;
+ 	kmemleak_load_module(mod, info);
+ 	return mod;
+ }
+ 
+ /* mod is no longer valid after this! */
+ static void module_deallocate(struct module *mod, struct load_info *info)
+ {
+ 	percpu_modfree(mod);
+ 	module_arch_freeing_init(mod);
+ 	module_memfree(mod->init_layout.base);
+ 	module_memfree(mod->core_layout.base);
+ }
+ 
+ int __weak module_finalize(const Elf_Ehdr *hdr,
+ 			   const Elf_Shdr *sechdrs,
+ 			   struct module *me)
+ {
+ 	return 0;
+ }
+ 
+ static int post_relocation(struct module *mod, const struct load_info *info)
+ {
+ 	/* Sort exception table now relocations are done. */
+ 	sort_extable(mod->extable, mod->extable + mod->num_exentries);
+ 
+ 	/* Copy relocated percpu area over. */
+ 	percpu_modcopy(mod, (void *)info->sechdrs[info->index.pcpu].sh_addr,
+ 		       info->sechdrs[info->index.pcpu].sh_size);
+ 
+ 	/* Setup kallsyms-specific fields. */
+ 	add_kallsyms(mod, info);
+ 
+ 	/* Arch-specific module finalizing. */
+ 	return module_finalize(info->hdr, info->sechdrs, mod);
+ }
+ 
+ /* Is this module of this name done loading?  No locks held. */
+ static bool finished_loading(const char *name)
+ {
+ 	struct module *mod;
+ 	bool ret;
+ 
+ 	/*
+ 	 * The module_mutex should not be a heavily contended lock;
+ 	 * if we get the occasional sleep here, we'll go an extra iteration
+ 	 * in the wait_event_interruptible(), which is harmless.
+ 	 */
+ 	sched_annotate_sleep();
+ 	mutex_lock(&module_mutex);
+ 	mod = find_module_all(name, strlen(name), true);
+ 	ret = !mod || mod->state == MODULE_STATE_LIVE;
+ 	mutex_unlock(&module_mutex);
+ 
+ 	return ret;
+ }
+ 
+ /* Call module constructors. */
+ static void do_mod_ctors(struct module *mod)
+ {
+ #ifdef CONFIG_CONSTRUCTORS
+ 	unsigned long i;
+ 
+ 	for (i = 0; i < mod->num_ctors; i++)
+ 		mod->ctors[i]();
+ #endif
+ }
+ 
+ /* For freeing module_init on success, in case kallsyms is traversing it */
+ struct mod_initfree {
+ 	struct llist_node node;
+ 	void *module_init;
+ };
+ 
+ static void do_free_init(struct work_struct *w)
+ {
+ 	struct llist_node *pos, *n, *list;
+ 	struct mod_initfree *initfree;
+ 
+ 	list = llist_del_all(&init_free_list);
+ 
+ 	synchronize_rcu();
+ 
+ 	llist_for_each_safe(pos, n, list) {
+ 		initfree = container_of(pos, struct mod_initfree, node);
+ 		module_memfree(initfree->module_init);
+ 		kfree(initfree);
+ 	}
+ }
+ 
+ /*
+  * This is where the real work happens.
+  *
+  * Keep it uninlined to provide a reliable breakpoint target, e.g. for the gdb
+  * helper command 'lx-symbols'.
+  */
+ static noinline int do_init_module(struct module *mod)
+ {
+ 	int ret = 0;
+ 	struct mod_initfree *freeinit;
+ 
+ 	freeinit = kmalloc(sizeof(*freeinit), GFP_KERNEL);
+ 	if (!freeinit) {
+ 		ret = -ENOMEM;
+ 		goto fail;
+ 	}
+ 	freeinit->module_init = mod->init_layout.base;
+ 
+ 	/*
+ 	 * We want to find out whether @mod uses async during init.  Clear
+ 	 * PF_USED_ASYNC.  async_schedule*() will set it.
+ 	 */
+ 	current->flags &= ~PF_USED_ASYNC;
+ 
+ 	do_mod_ctors(mod);
+ 	/* Start the module */
+ 	if (mod->init != NULL)
+ 		ret = do_one_initcall(mod->init);
+ 	if (ret < 0) {
+ 		goto fail_free_freeinit;
+ 	}
+ 	if (ret > 0) {
+ 		pr_warn("%s: '%s'->init suspiciously returned %d, it should "
+ 			"follow 0/-E convention\n"
+ 			"%s: loading module anyway...\n",
+ 			__func__, mod->name, ret, __func__);
+ 		dump_stack();
+ 	}
+ 
+ 	/* Now it's a first class citizen! */
+ 	mod->state = MODULE_STATE_LIVE;
+ 	blocking_notifier_call_chain(&module_notify_list,
+ 				     MODULE_STATE_LIVE, mod);
+ 
+ 	/* Delay uevent until module has finished its init routine */
+ 	kobject_uevent(&mod->mkobj.kobj, KOBJ_ADD);
+ 
+ 	/*
+ 	 * We need to finish all async code before the module init sequence
+ 	 * is done.  This has potential to deadlock.  For example, a newly
+ 	 * detected block device can trigger request_module() of the
+ 	 * default iosched from async probing task.  Once userland helper
+ 	 * reaches here, async_synchronize_full() will wait on the async
+ 	 * task waiting on request_module() and deadlock.
+ 	 *
+ 	 * This deadlock is avoided by performing async_synchronize_full()
+ 	 * iff module init queued any async jobs.  This isn't a full
+ 	 * solution as it will deadlock the same if module loading from
+ 	 * async jobs nests more than once; however, due to the various
+ 	 * constraints, this hack seems to be the best option for now.
+ 	 * Please refer to the following thread for details.
+ 	 *
+ 	 * http://thread.gmane.org/gmane.linux.kernel/1420814
+ 	 */
+ 	if (!mod->async_probe_requested && (current->flags & PF_USED_ASYNC))
+ 		async_synchronize_full();
+ 
+ 	ftrace_free_mem(mod, mod->init_layout.base, mod->init_layout.base +
+ 			mod->init_layout.size);
+ 	mutex_lock(&module_mutex);
+ 	/* Drop initial reference. */
+ 	module_put(mod);
+ 	trim_init_extable(mod);
+ #ifdef CONFIG_KALLSYMS
+ 	/* Switch to core kallsyms now init is done: kallsyms may be walking! */
+ 	rcu_assign_pointer(mod->kallsyms, &mod->core_kallsyms);
+ #endif
+ 	module_enable_ro(mod, true);
+ 	mod_tree_remove_init(mod);
+ 	module_arch_freeing_init(mod);
+ 	mod->init_layout.base = NULL;
+ 	mod->init_layout.size = 0;
+ 	mod->init_layout.ro_size = 0;
+ 	mod->init_layout.ro_after_init_size = 0;
+ 	mod->init_layout.text_size = 0;
+ 	/*
+ 	 * We want to free module_init, but be aware that kallsyms may be
+ 	 * walking this with preempt disabled.  In all the failure paths, we
+ 	 * call synchronize_rcu(), but we don't want to slow down the success
+ 	 * path. module_memfree() cannot be called in an interrupt, so do the
+ 	 * work and call synchronize_rcu() in a work queue.
+ 	 *
+ 	 * Note that module_alloc() on most architectures creates W+X page
+ 	 * mappings which won't be cleaned up until do_free_init() runs.  Any
+ 	 * code such as mark_rodata_ro() which depends on those mappings to
+ 	 * be cleaned up needs to sync with the queued work - ie
+ 	 * rcu_barrier()
+ 	 */
+ 	if (llist_add(&freeinit->node, &init_free_list))
+ 		schedule_work(&init_free_wq);
+ 
+ 	mutex_unlock(&module_mutex);
+ 	wake_up_all(&module_wq);
+ 
+ 	return 0;
+ 
+ fail_free_freeinit:
+ 	kfree(freeinit);
+ fail:
+ 	/* Try to protect us from buggy refcounters. */
+ 	mod->state = MODULE_STATE_GOING;
+ 	synchronize_rcu();
+ 	module_put(mod);
+ 	blocking_notifier_call_chain(&module_notify_list,
+ 				     MODULE_STATE_GOING, mod);
+ 	klp_module_going(mod);
+ 	ftrace_release_mod(mod);
+ 	free_module(mod);
+ 	wake_up_all(&module_wq);
+ 	return ret;
+ }
+ 
+ static int may_init_module(void)
+ {
+ 	if (!capable(CAP_SYS_MODULE) || modules_disabled)
+ 		return -EPERM;
+ 
+ 	return 0;
+ }
+ 
+ /*
+  * We try to place it in the list now to make sure it's unique before
+  * we dedicate too many resources.  In particular, temporary percpu
+  * memory exhaustion.
+  */
+ static int add_unformed_module(struct module *mod)
+ {
+ 	int err;
+ 	struct module *old;
+ 
+ 	mod->state = MODULE_STATE_UNFORMED;
+ 
+ again:
+ 	mutex_lock(&module_mutex);
+ 	old = find_module_all(mod->name, strlen(mod->name), true);
+ 	if (old != NULL) {
+ 		if (old->state != MODULE_STATE_LIVE) {
+ 			/* Wait in case it fails to load. */
+ 			mutex_unlock(&module_mutex);
+ 			err = wait_event_interruptible(module_wq,
+ 					       finished_loading(mod->name));
+ 			if (err)
+ 				goto out_unlocked;
+ 			goto again;
+ 		}
+ 		err = -EEXIST;
+ 		goto out;
+ 	}
+ 	mod_update_bounds(mod);
+ 	list_add_rcu(&mod->list, &modules);
+ 	mod_tree_insert(mod);
+ 	err = 0;
+ 
+ out:
+ 	mutex_unlock(&module_mutex);
+ out_unlocked:
+ 	return err;
+ }
+ 
+ static int complete_formation(struct module *mod, struct load_info *info)
+ {
+ 	int err;
+ 
+ 	mutex_lock(&module_mutex);
+ 
+ 	/* Find duplicate symbols (must be called under lock). */
+ 	err = verify_exported_symbols(mod);
+ 	if (err < 0)
+ 		goto out;
+ 
+ 	/* This relies on module_mutex for list integrity. */
+ 	module_bug_finalize(info->hdr, info->sechdrs, mod);
+ 
+ 	module_enable_ro(mod, false);
+ 	module_enable_nx(mod);
+ 	module_enable_x(mod);
+ 
+ 	/* Mark state as coming so strong_try_module_get() ignores us,
+ 	 * but kallsyms etc. can see us. */
+ 	mod->state = MODULE_STATE_COMING;
+ 	mutex_unlock(&module_mutex);
+ 
+ 	return 0;
+ 
+ out:
+ 	mutex_unlock(&module_mutex);
+ 	return err;
+ }
+ 
+ static int prepare_coming_module(struct module *mod)
+ {
+ 	int err;
+ 
+ 	ftrace_module_enable(mod);
+ 	err = klp_module_coming(mod);
+ 	if (err)
+ 		return err;
+ 
+ 	blocking_notifier_call_chain(&module_notify_list,
+ 				     MODULE_STATE_COMING, mod);
+ 	return 0;
+ }
+ 
+ static int unknown_module_param_cb(char *param, char *val, const char *modname,
+ 				   void *arg)
+ {
+ 	struct module *mod = arg;
+ 	int ret;
+ 
+ 	if (strcmp(param, "async_probe") == 0) {
+ 		mod->async_probe_requested = true;
+ 		return 0;
+ 	}
+ 
+ 	/* Check for magic 'dyndbg' arg */
+ 	ret = ddebug_dyndbg_module_param_cb(param, val, modname);
+ 	if (ret != 0)
+ 		pr_warn("%s: unknown parameter '%s' ignored\n", modname, param);
+ 	return 0;
+ }
+ 
+ /* Allocate and load the module: note that size of section 0 is always
+    zero, and we rely on this for optional sections. */
+ static int load_module(struct load_info *info, const char __user *uargs,
+ 		       int flags)
+ {
+ 	struct module *mod;
+ 	long err = 0;
+ 	char *after_dashes;
+ 
+ 	/*
+ 	 * Do the signature check (if any) first. All that
+ 	 * the signature check needs is info->len, it does
+ 	 * not need any of the section info. That can be
+ 	 * set up later. This will minimize the chances
+ 	 * of a corrupt module causing problems before
+ 	 * we even get to the signature check.
+ 	 *
+ 	 * The check will also adjust info->len by stripping
+ 	 * off the sig length at the end of the module, making
+ 	 * checks against info->len more correct.
+ 	 */
+ 	err = module_sig_check(info, flags);
+ 	if (err)
+ 		goto free_copy;
+ 
+ 	/*
+ 	 * Do basic sanity checks against the ELF header and
+ 	 * sections.
+ 	 */
+ 	err = elf_validity_check(info);
+ 	if (err) {
+ 		pr_err("Module has invalid ELF structures\n");
+ 		goto free_copy;
+ 	}
+ 
+ 	/*
+ 	 * Everything checks out, so set up the section info
+ 	 * in the info structure.
+ 	 */
+ 	err = setup_load_info(info, flags);
+ 	if (err)
+ 		goto free_copy;
+ 
+ 	/*
+ 	 * Now that we know we have the correct module name, check
+ 	 * if it's blacklisted.
+ 	 */
+ 	if (blacklisted(info->name)) {
+ 		err = -EPERM;
+ 		goto free_copy;
+ 	}
+ 
+ 	err = rewrite_section_headers(info, flags);
+ 	if (err)
+ 		goto free_copy;
+ 
+ 	/* Check module struct version now, before we try to use module. */
+ 	if (!check_modstruct_version(info, info->mod)) {
+ 		err = -ENOEXEC;
+ 		goto free_copy;
+ 	}
+ 
+ 	/* Figure out module layout, and allocate all the memory. */
+ 	mod = layout_and_allocate(info, flags);
+ 	if (IS_ERR(mod)) {
+ 		err = PTR_ERR(mod);
+ 		goto free_copy;
+ 	}
+ 
+ 	audit_log_kern_module(mod->name);
+ 
+ 	/* Reserve our place in the list. */
+ 	err = add_unformed_module(mod);
+ 	if (err)
+ 		goto free_module;
+ 
+ #ifdef CONFIG_MODULE_SIG
+ 	mod->sig_ok = info->sig_ok;
+ 	if (!mod->sig_ok) {
+ 		pr_notice_once("%s: module verification failed: signature "
+ 			       "and/or required key missing - tainting "
+ 			       "kernel\n", mod->name);
+ 		add_taint_module(mod, TAINT_UNSIGNED_MODULE, LOCKDEP_STILL_OK);
+ 	}
+ #endif
+ 
+ 	/* To avoid stressing percpu allocator, do this once we're unique. */
+ 	err = percpu_modalloc(mod, info);
+ 	if (err)
+ 		goto unlink_mod;
+ 
+ 	/* Now module is in final location, initialize linked lists, etc. */
+ 	err = module_unload_init(mod);
+ 	if (err)
+ 		goto unlink_mod;
+ 
+ 	init_param_lock(mod);
+ 
+ 	/* Now we've got everything in the final locations, we can
+ 	 * find optional sections. */
+ 	err = find_module_sections(mod, info);
+ 	if (err)
+ 		goto free_unload;
+ 
+ 	err = check_module_license_and_versions(mod);
+ 	if (err)
+ 		goto free_unload;
+ 
+ 	/* Set up MODINFO_ATTR fields */
+ 	setup_modinfo(mod, info);
+ 
+ 	/* Fix up syms, so that st_value is a pointer to location. */
+ 	err = simplify_symbols(mod, info);
+ 	if (err < 0)
+ 		goto free_modinfo;
+ 
+ 	err = apply_relocations(mod, info);
+ 	if (err < 0)
+ 		goto free_modinfo;
+ 
+ 	err = post_relocation(mod, info);
+ 	if (err < 0)
+ 		goto free_modinfo;
+ 
+ 	flush_module_icache(mod);
+ 
+ 	/* Now copy in args */
+ 	mod->args = strndup_user(uargs, ~0UL >> 1);
+ 	if (IS_ERR(mod->args)) {
+ 		err = PTR_ERR(mod->args);
+ 		goto free_arch_cleanup;
+ 	}
+ 
+ 	dynamic_debug_setup(mod, info->debug, info->num_debug);
+ 
+ 	/* Ftrace init must be called in the MODULE_STATE_UNFORMED state */
+ 	ftrace_module_init(mod);
+ 
+ 	/* Finally it's fully formed, ready to start executing. */
+ 	err = complete_formation(mod, info);
+ 	if (err)
+ 		goto ddebug_cleanup;
+ 
+ 	err = prepare_coming_module(mod);
+ 	if (err)
+ 		goto bug_cleanup;
+ 
+ 	/* Module is ready to execute: parsing args may do that. */
+ 	after_dashes = parse_args(mod->name, mod->args, mod->kp, mod->num_kp,
+ 				  -32768, 32767, mod,
+ 				  unknown_module_param_cb);
+ 	if (IS_ERR(after_dashes)) {
+ 		err = PTR_ERR(after_dashes);
+ 		goto coming_cleanup;
+ 	} else if (after_dashes) {
+ 		pr_warn("%s: parameters '%s' after `--' ignored\n",
+ 		       mod->name, after_dashes);
+ 	}
+ 
+ 	/* Link in to sysfs. */
+ 	err = mod_sysfs_setup(mod, info, mod->kp, mod->num_kp);
+ 	if (err < 0)
+ 		goto coming_cleanup;
+ 
+ 	if (is_livepatch_module(mod)) {
+ 		err = copy_module_elf(mod, info);
+ 		if (err < 0)
+ 			goto sysfs_cleanup;
+ 	}
+ 
+ 	/* Get rid of temporary copy. */
+ 	free_copy(info);
+ 
+ 	/* Done! */
+ 	trace_module_load(mod);
+ 
+ 	return do_init_module(mod);
+ 
+  sysfs_cleanup:
+ 	mod_sysfs_teardown(mod);
+  coming_cleanup:
+ 	mod->state = MODULE_STATE_GOING;
+ 	destroy_params(mod->kp, mod->num_kp);
+ 	blocking_notifier_call_chain(&module_notify_list,
+ 				     MODULE_STATE_GOING, mod);
+ 	klp_module_going(mod);
+  bug_cleanup:
+ 	mod->state = MODULE_STATE_GOING;
+ 	/* module_bug_cleanup needs module_mutex protection */
+ 	mutex_lock(&module_mutex);
+ 	module_bug_cleanup(mod);
+ 	mutex_unlock(&module_mutex);
+ 
+  ddebug_cleanup:
+ 	ftrace_release_mod(mod);
+ 	dynamic_debug_remove(mod, info->debug);
+ 	synchronize_rcu();
+ 	kfree(mod->args);
+  free_arch_cleanup:
+ 	module_arch_cleanup(mod);
+  free_modinfo:
+ 	free_modinfo(mod);
+  free_unload:
+ 	module_unload_free(mod);
+  unlink_mod:
+ 	mutex_lock(&module_mutex);
+ 	/* Unlink carefully: kallsyms could be walking list. */
+ 	list_del_rcu(&mod->list);
+ 	mod_tree_remove(mod);
+ 	wake_up_all(&module_wq);
+ 	/* Wait for RCU-sched synchronizing before releasing mod->list. */
+ 	synchronize_rcu();
+ 	mutex_unlock(&module_mutex);
+  free_module:
+ 	/* Free lock-classes; relies on the preceding sync_rcu() */
+ 	lockdep_free_key_range(mod->core_layout.base, mod->core_layout.size);
+ 
+ 	module_deallocate(mod, info);
+  free_copy:
+ 	free_copy(info);
+ 	return err;
+ }
+ 
+ SYSCALL_DEFINE3(init_module, void __user *, umod,
+ 		unsigned long, len, const char __user *, uargs)
+ {
+ 	int err;
+ 	struct load_info info = { };
+ 
+ 	err = may_init_module();
+ 	if (err)
+ 		return err;
+ 
+ 	pr_debug("init_module: umod=%p, len=%lu, uargs=%p\n",
+ 	       umod, len, uargs);
+ 
+ 	err = copy_module_from_user(umod, len, &info);
+ 	if (err)
+ 		return err;
+ 
+ 	return load_module(&info, uargs, 0);
+ }
+ 
+ SYSCALL_DEFINE3(finit_module, int, fd, const char __user *, uargs, int, flags)
+ {
+ 	struct load_info info = { };
+ 	loff_t size;
+ 	void *hdr;
+ 	int err;
+ 
+ 	err = may_init_module();
+ 	if (err)
+ 		return err;
+ 
+ 	pr_debug("finit_module: fd=%d, uargs=%p, flags=%i\n", fd, uargs, flags);
+ 
+ 	if (flags & ~(MODULE_INIT_IGNORE_MODVERSIONS
+ 		      |MODULE_INIT_IGNORE_VERMAGIC))
+ 		return -EINVAL;
+ 
+ 	err = kernel_read_file_from_fd(fd, &hdr, &size, INT_MAX,
+ 				       READING_MODULE);
+ 	if (err)
+ 		return err;
+ 	info.hdr = hdr;
+ 	info.len = size;
+ 
+ 	return load_module(&info, uargs, flags);
+ }
+ 
+ static inline int within(unsigned long addr, void *start, unsigned long size)
+ {
+ 	return ((void *)addr >= start && (void *)addr < start + size);
+ }
+ 
+ #ifdef CONFIG_KALLSYMS
+ /*
+  * Nonzero for ".L*" names and ARM ELF "mapping symbols" ($a, $d, $t, $x, bare
+  * or with ".suffix").  str[1] is tested because strchr() matches NUL too.
+  */
+ static inline int is_arm_mapping_symbol(const char *str)
+ {
+ 	if (str[0] == '.' && str[1] == 'L')
+ 		return true;
+ 	return str[0] == '$' && str[1] != '\0' && strchr("axtd", str[1])
+ 	       && (str[2] == '\0' || str[2] == '.');
+ }
+ 
+ static const char *kallsyms_symbol_name(struct mod_kallsyms *kallsyms, unsigned int symnum)
+ {
+ 	return kallsyms->strtab + kallsyms->symtab[symnum].st_name;
+ }
+ 
+ /*
+  * Given a module and address, find the corresponding symbol and return its name
+  * while providing its size and offset if needed.
+  */
+ static const char *find_kallsyms_symbol(struct module *mod,
+ 					unsigned long addr,
+ 					unsigned long *size,
+ 					unsigned long *offset)
+ {
+ 	unsigned int i, best = 0;
+ 	unsigned long nextval, bestval;
+ 	struct mod_kallsyms *kallsyms = rcu_dereference_sched(mod->kallsyms);
+ 
+ 	/* At worst, next value is at end of the module's text */
+ 	if (within_module_init(addr, mod))
+ 		nextval = (unsigned long)mod->init_layout.base+mod->init_layout.text_size;
+ 	else
+ 		nextval = (unsigned long)mod->core_layout.base+mod->core_layout.text_size;
+ 
+ 	bestval = kallsyms_symbol_value(&kallsyms->symtab[best]);
+ 
+ 	/* Scan for closest preceding symbol, and next symbol. (ELF
+ 	   starts real symbols at 1). */
+ 	for (i = 1; i < kallsyms->num_symtab; i++) {
+ 		const Elf_Sym *sym = &kallsyms->symtab[i];
+ 		unsigned long thisval = kallsyms_symbol_value(sym);
+ 
+ 		if (sym->st_shndx == SHN_UNDEF)
+ 			continue;
+ 
+ 		/* We ignore unnamed symbols: they're uninformative
+ 		 * and inserted at a whim. */
+ 		if (*kallsyms_symbol_name(kallsyms, i) == '\0'
+ 		    || is_arm_mapping_symbol(kallsyms_symbol_name(kallsyms, i)))
+ 			continue;
+ 
+ 		if (thisval <= addr && thisval > bestval) {
+ 			best = i;
+ 			bestval = thisval;
+ 		}
+ 		if (thisval > addr && thisval < nextval)
+ 			nextval = thisval;
+ 	}
+ 
+ 	if (!best)
+ 		return NULL;
+ 
+ 	if (size)
+ 		*size = nextval - bestval;
+ 	if (offset)
+ 		*offset = addr - bestval;
+ 
+ 	return kallsyms_symbol_name(kallsyms, best);
+ }
+ 
+ void * __weak dereference_module_function_descriptor(struct module *mod,
+ 						     void *ptr)
+ {
+ 	return ptr;
+ }
+ 
+ /* For kallsyms to ask for address resolution.  NULL means not found.  Careful
+  * not to lock to avoid deadlock on oopses, simply disable preemption. */
+ const char *module_address_lookup(unsigned long addr,
+ 			    unsigned long *size,
+ 			    unsigned long *offset,
+ 			    char **modname,
+ 			    char *namebuf)
+ {
+ 	const char *ret = NULL;
+ 	struct module *mod;
+ 
+ 	preempt_disable();
+ 	mod = __module_address(addr);
+ 	if (mod) {
+ 		if (modname)
+ 			*modname = mod->name;
+ 
+ 		ret = find_kallsyms_symbol(mod, addr, size, offset);
+ 	}
+ 	/* Copy while preemption is off; strlcpy() always NUL-terminates */
+ 	if (ret) {
+ 		strlcpy(namebuf, ret, KSYM_NAME_LEN);
+ 		ret = namebuf;
+ 	}
+ 	preempt_enable();
+ 
+ 	return ret;
+ }
+ 
+ int lookup_module_symbol_name(unsigned long addr, char *symname)
+ {
+ 	struct module *mod;
+ 
+ 	preempt_disable();
+ 	list_for_each_entry_rcu(mod, &modules, list) {
+ 		if (mod->state == MODULE_STATE_UNFORMED)
+ 			continue;
+ 		if (within_module(addr, mod)) {
+ 			const char *sym;
+ 
+ 			sym = find_kallsyms_symbol(mod, addr, NULL, NULL);
+ 			if (!sym)
+ 				goto out;
+ 
+ 			strlcpy(symname, sym, KSYM_NAME_LEN);
+ 			preempt_enable();
+ 			return 0;
+ 		}
+ 	}
+ out:
+ 	preempt_enable();
+ 	return -ERANGE;
+ }
+ 
+ int lookup_module_symbol_attrs(unsigned long addr, unsigned long *size,
+ 			unsigned long *offset, char *modname, char *name)
+ {
+ 	struct module *mod;
+ 
+ 	preempt_disable();
+ 	list_for_each_entry_rcu(mod, &modules, list) {
+ 		if (mod->state == MODULE_STATE_UNFORMED)
+ 			continue;
+ 		if (within_module(addr, mod)) {
+ 			const char *sym;
+ 
+ 			sym = find_kallsyms_symbol(mod, addr, size, offset);
+ 			if (!sym)
+ 				goto out;
+ 			if (modname)
+ 				strlcpy(modname, mod->name, MODULE_NAME_LEN);
+ 			if (name)
+ 				strlcpy(name, sym, KSYM_NAME_LEN);
+ 			preempt_enable();
+ 			return 0;
+ 		}
+ 	}
+ out:
+ 	preempt_enable();
+ 	return -ERANGE;
+ }
+ 
+ int module_get_kallsym(unsigned int symnum, unsigned long *value, char *type,
+ 			char *name, char *module_name, int *exported)
+ {
+ 	struct module *mod;
+ 
+ 	preempt_disable();
+ 	list_for_each_entry_rcu(mod, &modules, list) {
+ 		struct mod_kallsyms *kallsyms;
+ 
+ 		if (mod->state == MODULE_STATE_UNFORMED)
+ 			continue;
+ 		kallsyms = rcu_dereference_sched(mod->kallsyms);
+ 		if (symnum < kallsyms->num_symtab) {
+ 			const Elf_Sym *sym = &kallsyms->symtab[symnum];
+ 
+ 			*value = kallsyms_symbol_value(sym);
+ 			*type = kallsyms->typetab[symnum];
+ 			strlcpy(name, kallsyms_symbol_name(kallsyms, symnum), KSYM_NAME_LEN);
+ 			strlcpy(module_name, mod->name, MODULE_NAME_LEN);
+ 			*exported = is_exported(name, *value, mod);
+ 			preempt_enable();
+ 			return 0;
+ 		}
+ 		symnum -= kallsyms->num_symtab;
+ 	}
+ 	preempt_enable();
+ 	return -ERANGE;
+ }
+ 
+ /* Given a module and name of symbol, find and return the symbol's value */
+ static unsigned long find_kallsyms_symbol_value(struct module *mod, const char *name)
+ {
+ 	unsigned int i;
+ 	struct mod_kallsyms *kallsyms = rcu_dereference_sched(mod->kallsyms);
+ 
+ 	for (i = 0; i < kallsyms->num_symtab; i++) {
+ 		const Elf_Sym *sym = &kallsyms->symtab[i];
+ 
+ 		if (strcmp(name, kallsyms_symbol_name(kallsyms, i)) == 0 &&
+ 		    sym->st_shndx != SHN_UNDEF)
+ 			return kallsyms_symbol_value(sym);
+ 	}
+ 	return 0;
+ }
+ 
+ /* Look for this name: can be of form module:name. */
+ unsigned long module_kallsyms_lookup_name(const char *name)
+ {
+ 	struct module *mod;
+ 	char *colon;
+ 	unsigned long ret = 0;
+ 
+ 	/* Don't lock: we're in enough trouble already. */
+ 	preempt_disable();
+ 	if ((colon = strnchr(name, MODULE_NAME_LEN, ':')) != NULL) {
+ 		if ((mod = find_module_all(name, colon - name, false)) != NULL)
+ 			ret = find_kallsyms_symbol_value(mod, colon+1);
+ 	} else {
+ 		list_for_each_entry_rcu(mod, &modules, list) {
+ 			if (mod->state == MODULE_STATE_UNFORMED)
+ 				continue;
+ 			if ((ret = find_kallsyms_symbol_value(mod, name)) != 0)
+ 				break;
+ 		}
+ 	}
+ 	preempt_enable();
+ 	return ret;
+ }
+ 
+ int module_kallsyms_on_each_symbol(int (*fn)(void *, const char *,
+ 					     struct module *, unsigned long),
+ 				   void *data)
+ {
+ 	struct module *mod;
+ 	unsigned int i;
+ 	int ret;
+ 
+ 	module_assert_mutex();
+ 
+ 	list_for_each_entry(mod, &modules, list) {
+ 		/* We hold module_mutex: no need for rcu_dereference_sched */
+ 		struct mod_kallsyms *kallsyms = mod->kallsyms;
+ 
+ 		if (mod->state == MODULE_STATE_UNFORMED)
+ 			continue;
+ 		for (i = 0; i < kallsyms->num_symtab; i++) {
+ 			const Elf_Sym *sym = &kallsyms->symtab[i];
+ 
+ 			if (sym->st_shndx == SHN_UNDEF)
+ 				continue;
+ 
+ 			ret = fn(data, kallsyms_symbol_name(kallsyms, i),
+ 				 mod, kallsyms_symbol_value(sym));
+ 			if (ret != 0)
+ 				return ret;
+ 		}
+ 	}
+ 	return 0;
+ }
+ #endif /* CONFIG_KALLSYMS */
+ 
+ /* Maximum number of characters written by module_flags() */
+ #define MODULE_FLAGS_BUF_SIZE (TAINT_FLAGS_COUNT + 4)
+ 
+ /* Keep in sync with MODULE_FLAGS_BUF_SIZE !!! */
+ static char *module_flags(struct module *mod, char *buf)
+ {
+ 	int bx = 0;
+ 
+ 	BUG_ON(mod->state == MODULE_STATE_UNFORMED);
+ 	if (mod->taints ||
+ 	    mod->state == MODULE_STATE_GOING ||
+ 	    mod->state == MODULE_STATE_COMING) {
+ 		buf[bx++] = '(';
+ 		bx += module_flags_taint(mod, buf + bx);
+ 		/* Show a - for module-is-being-unloaded */
+ 		if (mod->state == MODULE_STATE_GOING)
+ 			buf[bx++] = '-';
+ 		/* Show a + for module-is-being-loaded */
+ 		if (mod->state == MODULE_STATE_COMING)
+ 			buf[bx++] = '+';
+ 		buf[bx++] = ')';
+ 	}
+ 	buf[bx] = '\0';
+ 
+ 	return buf;
+ }
+ 
+ #ifdef CONFIG_PROC_FS
+ /* Called by the /proc file system to return a list of modules. */
+ static void *m_start(struct seq_file *m, loff_t *pos)
+ {
+ 	mutex_lock(&module_mutex);
+ 	return seq_list_start(&modules, *pos);
+ }
+ 
+ static void *m_next(struct seq_file *m, void *p, loff_t *pos)
+ {
+ 	return seq_list_next(p, &modules, pos);
+ }
+ 
+ static void m_stop(struct seq_file *m, void *p)
+ {
+ 	mutex_unlock(&module_mutex);
+ }
+ 
+ static int m_show(struct seq_file *m, void *p)
+ {
+ 	struct module *mod = list_entry(p, struct module, list);
+ 	char buf[MODULE_FLAGS_BUF_SIZE];
+ 	void *value;
+ 
+ 	/* We always ignore unformed modules. */
+ 	if (mod->state == MODULE_STATE_UNFORMED)
+ 		return 0;
+ 
+ 	seq_printf(m, "%s %u",
+ 		   mod->name, mod->init_layout.size + mod->core_layout.size);
+ 	print_unload_info(m, mod);
+ 
+ 	/* Informative for users. */
+ 	seq_printf(m, " %s",
+ 		   mod->state == MODULE_STATE_GOING ? "Unloading" :
+ 		   mod->state == MODULE_STATE_COMING ? "Loading" :
+ 		   "Live");
+ 	/* Used by oprofile and other similar tools. */
+ 	value = m->private ? NULL : mod->core_layout.base;
+ 	seq_printf(m, " 0x%px", value);
+ 
+ 	/* Taints info */
+ 	if (mod->taints)
+ 		seq_printf(m, " %s", module_flags(mod, buf));
+ 
+ 	seq_puts(m, "\n");
+ 	return 0;
+ }
+ 
+ /* Format: modulename size refcount deps state address [taints]
+ 
+    Where refcount is a number or -, and deps is a comma-separated list
+    of depends or -.
+ */
+ static const struct seq_operations modules_op = {
+ 	.start	= m_start,
+ 	.next	= m_next,
+ 	.stop	= m_stop,
+ 	.show	= m_show
+ };
+ 
+ /*
+  * This also sets the "private" pointer to non-NULL if the
+  * kernel pointers should be hidden (so you can just test
+  * "m->private" to see if you should keep the values private).
+  *
+  * We use the same logic as for /proc/kallsyms.
+  */
+ static int modules_open(struct inode *inode, struct file *file)
+ {
+ 	int err = seq_open(file, &modules_op);
+ 
+ 	if (!err) {
+ 		struct seq_file *m = file->private_data;
+ 		m->private = kallsyms_show_value(file->f_cred) ? NULL : (void *)8ul;
+ 	}
+ 
+ 	return err;
+ }
+ 
+ static const struct file_operations proc_modules_operations = {
+ 	.open		= modules_open,
+ 	.read		= seq_read,
+ 	.llseek		= seq_lseek,
+ 	.release	= seq_release,
+ };
+ 
+ static int __init proc_modules_init(void)
+ {
+ 	proc_create("modules", 0, NULL, &proc_modules_operations);
+ 	return 0;
+ }
+ module_init(proc_modules_init);
+ #endif
+ 
+ /* Given an address, look for it in the module exception tables. */
+ const struct exception_table_entry *search_module_extables(unsigned long addr)
+ {
+ 	const struct exception_table_entry *e = NULL;
+ 	struct module *mod;
+ 
+ 	preempt_disable();
+ 	mod = __module_address(addr);
+ 	if (!mod)
+ 		goto out;
+ 
+ 	if (!mod->num_exentries)
+ 		goto out;
+ 
+ 	e = search_extable(mod->extable,
+ 			   mod->num_exentries,
+ 			   addr);
+ out:
+ 	preempt_enable();
+ 
+ 	/*
+ 	 * Now, if we found one, we are running inside it now, hence
+ 	 * we cannot unload the module, hence no refcnt needed.
+ 	 */
+ 	return e;
+ }
+ 
+ /*
+  * is_module_address - is this address inside a module?
+  * @addr: the address to check.
+  *
+  * See is_module_text_address() if you simply want to see if the address
+  * is code (not data).
+  */
+ bool is_module_address(unsigned long addr)
+ {
+ 	bool ret;
+ 
+ 	preempt_disable();
+ 	ret = __module_address(addr) != NULL;
+ 	preempt_enable();
+ 
+ 	return ret;
+ }
+ 
+ /*
+  * __module_address - get the module which contains an address.
+  * @addr: the address.
+  *
+  * Must be called with preempt disabled or module mutex held so that
+  * module doesn't get freed during this.
+  */
+ struct module *__module_address(unsigned long addr)
+ {
+ 	struct module *mod;
+ 
+ 	if (addr < module_addr_min || addr > module_addr_max)
+ 		return NULL;
+ 
+ 	module_assert_mutex_or_preempt();
+ 
+ 	mod = mod_find(addr);
+ 	if (mod) {
+ 		BUG_ON(!within_module(addr, mod));
+ 		if (mod->state == MODULE_STATE_UNFORMED)
+ 			mod = NULL;
+ 	}
+ 	return mod;
+ }
+ EXPORT_SYMBOL_GPL(__module_address);
+ 
+ /*
+  * is_module_text_address - is this address inside module code?
+  * @addr: the address to check.
+  *
+  * See is_module_address() if you simply want to see if the address is
+  * anywhere in a module.  See kernel_text_address() for testing if an
+  * address corresponds to kernel or module code.
+  */
+ bool is_module_text_address(unsigned long addr)
+ {
+ 	bool ret;
+ 
+ 	preempt_disable();
+ 	ret = __module_text_address(addr) != NULL;
+ 	preempt_enable();
+ 
+ 	return ret;
+ }
+ 
+ /*
+  * __module_text_address - get the module whose code contains an address.
+  * @addr: the address.
+  *
+  * Must be called with preempt disabled or module mutex held so that
+  * module doesn't get freed during this.
+  */
+ struct module *__module_text_address(unsigned long addr)
+ {
+ 	struct module *mod = __module_address(addr);
+ 	if (mod) {
+ 		/* Make sure it's within the text section. */
+ 		if (!within(addr, mod->init_layout.base, mod->init_layout.text_size)
+ 		    && !within(addr, mod->core_layout.base, mod->core_layout.text_size))
+ 			mod = NULL;
+ 	}
+ 	return mod;
+ }
+ EXPORT_SYMBOL_GPL(__module_text_address);
+ 
+ /* Don't grab lock, we're oopsing. */
+ void print_modules(void)
+ {
+ 	struct module *mod;
+ 	char buf[MODULE_FLAGS_BUF_SIZE];
+ 
+ 	printk(KERN_DEFAULT "Modules linked in:");
+ 	/* Most callers should already have preempt disabled, but make sure */
+ 	preempt_disable();
+ 	list_for_each_entry_rcu(mod, &modules, list) {
+ 		if (mod->state == MODULE_STATE_UNFORMED)
+ 			continue;
+ 		pr_cont(" %s%s", mod->name, module_flags(mod, buf));
+ 	}
+ 	preempt_enable();
+ 	if (last_unloaded_module[0])
+ 		pr_cont(" [last unloaded: %s]", last_unloaded_module);
+ 	pr_cont("\n");
+ }
+ 
+ #ifdef CONFIG_MODVERSIONS
+ /* Generate the signature for all relevant module structures here.
+  * If these change, we don't want to try to parse the module. */
+ void module_layout(struct module *mod,
+ 		   struct modversion_info *ver,
+ 		   struct kernel_param *kp,
+ 		   struct kernel_symbol *ks,
+ 		   struct tracepoint * const *tp)
+ {
+ }
+ EXPORT_SYMBOL(module_layout);
+ #endif
diff --color -rcNP Master/kernel/pid.c OG/kernel/pid.c
*** Master/kernel/pid.c	2021-04-20 14:17:31.000000000 -0400
--- OG/kernel/pid.c	2021-04-20 15:11:34.518000000 -0400
***************
*** 355,360 ****
--- 355,372 ----
  	return find_task_by_pid_ns(vnr, task_active_pid_ns(current));
  }
  
+ /*
+  * Look up a task by PID number in the caller's active PID namespace,
+  * going through find_pid_ns()/pid_task() directly rather than
+  * find_task_by_pid_ns().  Must be called under rcu_read_lock().
+  */
+ struct task_struct *find_task_by_vpid_unrestricted(pid_t vnr)
+ {
+ 	RCU_LOCKDEP_WARN(!rcu_read_lock_held(),
+ 			 "find_task_by_vpid_unrestricted() needs rcu_read_lock() protection");
+ 	return pid_task(find_pid_ns(vnr, task_active_pid_ns(current)), PIDTYPE_PID);
+ }
+ 
  struct task_struct *find_get_task_by_vpid(pid_t nr)
  {
  	struct task_struct *task;
diff --color -rcNP Master/kernel/sched/core.c OG/kernel/sched/core.c
*** Master/kernel/sched/core.c	2021-04-20 14:17:31.000000000 -0400
--- OG/kernel/sched/core.c	2021-04-20 15:11:34.519000000 -0400
***************
*** 11,16 ****
--- 11,17 ----
  #include <linux/nospec.h>
  
  #include <linux/kcov.h>
+ #include <linux/minisec.h>
  
  #include <asm/switch_to.h>
  #include <asm/tlb.h>
***************
*** 4671,4677 ****
  	nice = task_nice(current) + increment;
  
  	nice = clamp_val(nice, MIN_NICE, MAX_NICE);
! 	if (increment < 0 && !can_nice(current, nice))
  		return -EPERM;
  
  	retval = security_task_setnice(current, nice);
--- 4672,4678 ----
  	nice = task_nice(current) + increment;
  
  	nice = clamp_val(nice, MIN_NICE, MAX_NICE);
! 	if ((increment < 0 && !can_nice(current, nice)) || gr_handle_chroot_nice())
  		return -EPERM;
  
  	retval = security_task_setnice(current, nice);
diff --color -rcNP Master/kernel/sched/debug.c OG/kernel/sched/debug.c
*** Master/kernel/sched/debug.c	2021-04-20 14:17:31.000000000 -0400
--- OG/kernel/sched/debug.c	2021-04-20 15:11:34.519000000 -0400
***************
*** 176,183 ****
--- 176,188 ----
  	debugfs_create_file("sched_features", 0644, NULL, NULL,
  			&sched_feat_fops);
  
+ #ifdef CONFIG_MINISEC_PROC_ADD
+ 	debugfs_create_bool("sched_debug", 0400, NULL,
+ 			&sched_debug_enabled);
+ #else
  	debugfs_create_bool("sched_debug", 0644, NULL,
  			&sched_debug_enabled);
+ #endif
  
  	return 0;
  }
diff --color -rcNP Master/kernel/sys.c OG/kernel/sys.c
*** Master/kernel/sys.c	2021-04-20 14:17:31.000000000 -0400
--- OG/kernel/sys.c	2021-04-20 15:11:34.519000000 -0400
***************
*** 62,67 ****
--- 62,68 ----
  #include <linux/cred.h>
  
  #include <linux/nospec.h>
+ #include <linux/minisec.h>
  
  #include <linux/kmsg_dump.h>
  /* Move somewhere else to avoid recompiling? */
***************
*** 181,186 ****
--- 182,193 ----
  		error = -EACCES;
  		goto out;
  	}
+ 
+ 	if (gr_handle_chroot_setpriority(p, niceval)) {
+ 		error = -EACCES;
+ 		goto out;
+ 	}
+ 
  	no_nice = security_task_setnice(p, niceval);
  	if (no_nice) {
  		error = no_nice;
diff --color -rcNP Master/kernel/sysctl.c OG/kernel/sysctl.c
*** Master/kernel/sysctl.c	2021-04-20 14:17:31.000000000 -0400
--- OG/kernel/sysctl.c	2021-04-20 15:11:34.520000000 -0400
***************
*** 70,75 ****
--- 70,79 ----
  #include <linux/userfaultfd_k.h>
  #include <linux/ipc.h>
  
+ #ifdef CONFIG_MINISEC
+ #include <linux/minisec.h>
+ #endif
+ 
  #include "../lib/kstrtox.h"
  
  #include <linux/uaccess.h>
***************
*** 305,310 ****
--- 309,330 ----
  	{ }
  };
  
+ extern struct ctl_table grsecurity_table[];
+ 
+ #ifdef CONFIG_MINISEC_SOFTMODE
+ /* Entries of the "pax" sysctl directory that kern_table hooks in via .child. */
+ static struct ctl_table pax_table[] = {
+ 	{
+ 		.procname	= "softmode",
+ 		.data		= &pax_softmode,
+ 		.maxlen		= sizeof(unsigned int),
+ 		.mode		= 0600,
+ 		.proc_handler	= &proc_dointvec,
+ 	},
+ 	{ }
+ };
+ #endif
+ 
  #ifdef CONFIG_SCHED_DEBUG
  static int min_sched_granularity_ns __read_only = 100000;		/* 100 usecs */
  static int max_sched_granularity_ns __read_only = NSEC_PER_SEC;	/* 1 second */
***************
*** 322,327 ****
--- 342,363 ----
  #endif
  
  static struct ctl_table kern_table[] = {
+ #ifdef CONFIG_MINISEC_SOFTMODE
+ 	/* "pax" directory (mode 0500); its entries come from pax_table */
+ 	{
+ 		.procname	= "pax",
+ 		.mode		= 0500,
+ 		.child		= pax_table,
+ 	},
+ #endif
+ #ifdef CONFIG_MINISEC_SYSCTL
+ 	/* "grsecurity" directory (mode 0500); entries come from the extern grsecurity_table */
+ 	{
+ 		.procname	= "grsecurity",
+ 		.mode		= 0500,
+ 		.child		= grsecurity_table,
+ 	},
+ #endif
  	{
  		.procname	= "sched_child_runs_first",
  		.data		= &sysctl_sched_child_runs_first,
***************
*** 1386,1392 ****
  		.proc_handler	= overcommit_kbytes_handler,
  	},
  	{
! 		.procname	= "page-cluster", 
  		.data		= &page_cluster,
  		.maxlen		= sizeof(int),
  		.mode		= 0644,
--- 1434,1440 ----
  		.proc_handler	= overcommit_kbytes_handler,
  	},
  	{
! 		.procname	= "page-cluster",
  		.data		= &page_cluster,
  		.maxlen		= sizeof(int),
  		.mode		= 0644,
***************
*** 1904,1910 ****
  		.mode		= 0555,
  		.child		= inotify_table,
  	},
! #endif	
  #ifdef CONFIG_EPOLL
  	{
  		.procname	= "epoll",
--- 1952,1958 ----
  		.mode		= 0555,
  		.child		= inotify_table,
  	},
! #endif
  #ifdef CONFIG_EPOLL
  	{
  		.procname	= "epoll",
***************
*** 2385,2396 ****
  	int *i, vleft, first = 1, err = 0;
  	size_t left;
  	char *kbuf = NULL, *p;
! 	
  	if (!tbl_data || !table->maxlen || !*lenp || (*ppos && !write)) {
  		*lenp = 0;
  		return 0;
  	}
! 	
  	i = (int *) tbl_data;
  	vleft = table->maxlen / sizeof(*i);
  	left = *lenp;
--- 2433,2444 ----
  	int *i, vleft, first = 1, err = 0;
  	size_t left;
  	char *kbuf = NULL, *p;
! 
  	if (!tbl_data || !table->maxlen || !*lenp || (*ppos && !write)) {
  		*lenp = 0;
  		return 0;
  	}
! 
  	i = (int *) tbl_data;
  	vleft = table->maxlen / sizeof(*i);
  	left = *lenp;
***************
*** 2616,2622 ****
   * @ppos: file position
   *
   * Reads/writes up to table->maxlen/sizeof(unsigned int) integer
!  * values from/to the user buffer, treated as an ASCII string. 
   *
   * Returns 0 on success.
   */
--- 2664,2670 ----
   * @ppos: file position
   *
   * Reads/writes up to table->maxlen/sizeof(unsigned int) integer
!  * values from/to the user buffer, treated as an ASCII string.
   *
   * Returns 0 on success.
   */
***************
*** 2646,2651 ****
--- 2694,2749 ----
  				 do_proc_douintvec_conv, NULL);
  }
  
+ /*
+  * Conversion callback used by proc_dointvec_secure().
+  *
+  * Write: store the parsed magnitude *lvalp into *valp, negated when *negp
+  * is set; returns -EINVAL if the value does not fit in an int.
+  * Read: split *valp into a sign flag (*negp) and a magnitude (*lvalp).
+  *
+  * The pax_open_kernel()/pax_close_kernel() bracketing of the two stores
+  * (meant for PAX_KERNEXEC) is not enabled in this tree.
+  */
+ static int do_proc_dointvec_conv_secure(bool *negp, unsigned long *lvalp,
+ 				 int *valp,
+ 				 int write, void *data)
+ {
+ 	if (write) {
+ 		if (*negp) {
+ 			if (*lvalp > (unsigned long) INT_MAX + 1)
+ 				return -EINVAL;
+ 			*valp = -*lvalp;
+ 		} else {
+ 			if (*lvalp > (unsigned long) INT_MAX)
+ 				return -EINVAL;
+ 			*valp = *lvalp;
+ 		}
+ 	} else {
+ 		int val = *valp;
+ 		if (val < 0) {
+ 			*negp = true;
+ 			*lvalp = -(unsigned long)val;
+ 		} else {
+ 			*negp = false;
+ 			*lvalp = (unsigned long)val;
+ 		}
+ 	}
+ 	return 0;
+ }
+ 
+ /*
+  * proc_dointvec_secure - integer-vector sysctl handler
+  *
+  * Takes the same arguments as the other proc_do*vec handlers and forwards
+  * them to do_proc_dointvec() with do_proc_dointvec_conv_secure() as converter.
+  */
+ int proc_dointvec_secure(struct ctl_table *table, int write,
+ 		     void __user *buffer, size_t *lenp, loff_t *ppos)
+ {
+ 	return do_proc_dointvec(table, write, buffer, lenp, ppos,
+ 				do_proc_dointvec_conv_secure, NULL);
+ }
+ 
  /*
   * Taint values can only be increased
   * This means we can safely use a temporary.
***************
*** 3129,3135 ****
   * @ppos: file position
   *
   * Reads/writes up to table->maxlen/sizeof(unsigned int) integer
!  * values from/to the user buffer, treated as an ASCII string. 
   * The values read are assumed to be in seconds, and are converted into
   * jiffies.
   *
--- 3253,3259 ----
   * @ppos: file position
   *
   * Reads/writes up to table->maxlen/sizeof(unsigned int) integer
!  * values from/to the user buffer, treated as an ASCII string.
   * The values read are assumed to be in seconds, and are converted into
   * jiffies.
   *
***************
*** 3151,3158 ****
   * @ppos: pointer to the file position
   *
   * Reads/writes up to table->maxlen/sizeof(unsigned int) integer
!  * values from/to the user buffer, treated as an ASCII string. 
!  * The values read are assumed to be in 1/USER_HZ seconds, and 
   * are converted into jiffies.
   *
   * Returns 0 on success.
--- 3275,3282 ----
   * @ppos: pointer to the file position
   *
   * Reads/writes up to table->maxlen/sizeof(unsigned int) integer
!  * values from/to the user buffer, treated as an ASCII string.
!  * The values read are assumed to be in 1/USER_HZ seconds, and
   * are converted into jiffies.
   *
   * Returns 0 on success.
***************
*** 3174,3181 ****
   * @ppos: the current position in the file
   *
   * Reads/writes up to table->maxlen/sizeof(unsigned int) integer
!  * values from/to the user buffer, treated as an ASCII string. 
!  * The values read are assumed to be in 1/1000 seconds, and 
   * are converted into jiffies.
   *
   * Returns 0 on success.
--- 3298,3305 ----
   * @ppos: the current position in the file
   *
   * Reads/writes up to table->maxlen/sizeof(unsigned int) integer
!  * values from/to the user buffer, treated as an ASCII string.
!  * The values read are assumed to be in 1/1000 seconds, and
   * are converted into jiffies.
   *
   * Returns 0 on success.
diff --color -rcNP Master/kernel/sysctl.c.orig OG/kernel/sysctl.c.orig
*** Master/kernel/sysctl.c.orig	1969-12-31 19:00:00.000000000 -0500
--- OG/kernel/sysctl.c.orig	2021-04-20 15:10:45.394000000 -0400
***************
*** 0 ****
--- 1,3549 ----
+ // SPDX-License-Identifier: GPL-2.0-only
+ /*
+  * sysctl.c: General linux system control interface
+  *
+  * Begun 24 March 1995, Stephen Tweedie
+  * Added /proc support, Dec 1995
+  * Added bdflush entry and intvec min/max checking, 2/23/96, Tom Dyas.
+  * Added hooks for /proc/sys/net (minor, minor patch), 96/4/1, Mike Shaver.
+  * Added kernel/java-{interpreter,appletviewer}, 96/5/10, Mike Shaver.
+  * Dynamic registration fixes, Stephen Tweedie.
+  * Added kswapd-interval, ctrl-alt-del, printk stuff, 1/8/97, Chris Horn.
+  * Made sysctl support optional via CONFIG_SYSCTL, 1/10/97, Chris
+  *  Horn.
+  * Added proc_doulongvec_ms_jiffies_minmax, 09/08/99, Carlos H. Bauer.
+  * Added proc_doulongvec_minmax, 09/08/99, Carlos H. Bauer.
+  * Changed linked lists to use list.h instead of lists.h, 02/24/00, Bill
+  *  Wendling.
+  * The list_for_each() macro wasn't appropriate for the sysctl loop.
+  *  Removed it and replaced it with older style, 03/23/00, Bill Wendling
+  */
+ 
+ #include <linux/module.h>
+ #include <linux/aio.h>
+ #include <linux/mm.h>
+ #include <linux/swap.h>
+ #include <linux/slab.h>
+ #include <linux/sysctl.h>
+ #include <linux/bitmap.h>
+ #include <linux/signal.h>
+ #include <linux/printk.h>
+ #include <linux/proc_fs.h>
+ #include <linux/security.h>
+ #include <linux/ctype.h>
+ #include <linux/kmemleak.h>
+ #include <linux/fs.h>
+ #include <linux/init.h>
+ #include <linux/kernel.h>
+ #include <linux/kobject.h>
+ #include <linux/net.h>
+ #include <linux/sysrq.h>
+ #include <linux/highuid.h>
+ #include <linux/writeback.h>
+ #include <linux/ratelimit.h>
+ #include <linux/compaction.h>
+ #include <linux/hugetlb.h>
+ #include <linux/initrd.h>
+ #include <linux/key.h>
+ #include <linux/times.h>
+ #include <linux/limits.h>
+ #include <linux/dcache.h>
+ #include <linux/dnotify.h>
+ #include <linux/syscalls.h>
+ #include <linux/vmstat.h>
+ #include <linux/nfs_fs.h>
+ #include <linux/acpi.h>
+ #include <linux/reboot.h>
+ #include <linux/ftrace.h>
+ #include <linux/perf_event.h>
+ #include <linux/kprobes.h>
+ #include <linux/pipe_fs_i.h>
+ #include <linux/oom.h>
+ #include <linux/kmod.h>
+ #include <linux/capability.h>
+ #include <linux/binfmts.h>
+ #include <linux/sched/sysctl.h>
+ #include <linux/sched/coredump.h>
+ #include <linux/kexec.h>
+ #include <linux/bpf.h>
+ #include <linux/mount.h>
+ #include <linux/userfaultfd_k.h>
+ #include <linux/ipc.h>
+ 
+ #ifdef CONFIG_MINISEC
+ #include <linux/minisec.h>
+ #endif
+ 
+ #include "../lib/kstrtox.h"
+ 
+ #include <linux/uaccess.h>
+ #include <asm/processor.h>
+ 
+ #ifdef CONFIG_X86
+ #include <asm/nmi.h>
+ #include <asm/stacktrace.h>
+ #include <asm/io.h>
+ #endif
+ #ifdef CONFIG_SPARC
+ #include <asm/setup.h>
+ #endif
+ #ifdef CONFIG_BSD_PROCESS_ACCT
+ #include <linux/acct.h>
+ #endif
+ #ifdef CONFIG_RT_MUTEXES
+ #include <linux/rtmutex.h>
+ #endif
+ #if defined(CONFIG_PROVE_LOCKING) || defined(CONFIG_LOCK_STAT)
+ #include <linux/lockdep.h>
+ #endif
+ #ifdef CONFIG_CHR_DEV_SG
+ #include <scsi/sg.h>
+ #endif
+ #ifdef CONFIG_STACKLEAK_RUNTIME_DISABLE
+ #include <linux/stackleak.h>
+ #endif
+ #ifdef CONFIG_LOCKUP_DETECTOR
+ #include <linux/nmi.h>
+ #endif
+ #if defined CONFIG_TTY
+ #include <linux/tty.h>
+ #endif
+ 
+ #if defined(CONFIG_SYSCTL)
+ 
+ /* External variables not in a header file. */
+ extern int suid_dumpable;
+ #ifdef CONFIG_COREDUMP
+ extern int core_uses_pid;
+ extern char core_pattern[];
+ extern unsigned int core_pipe_limit;
+ #endif
+ #ifdef CONFIG_USER_NS
+ extern int unprivileged_userns_clone;
+ #endif
+ extern int pid_max;
+ extern int pid_max_min, pid_max_max;
+ extern int percpu_pagelist_fraction;
+ extern int latencytop_enabled;
+ extern unsigned int sysctl_nr_open_min, sysctl_nr_open_max;
+ #ifndef CONFIG_MMU
+ extern int sysctl_nr_trim_pages;
+ #endif
+ 
+ /* Constants used for minimum and  maximum */
+ #ifdef CONFIG_LOCKUP_DETECTOR
+ static int sixty __read_only = 60;
+ #endif
+ 
+ static int __maybe_unused neg_one __read_only = -1;
+ static int __maybe_unused two __read_only = 2;
+ static int __maybe_unused four __read_only = 4;
+ static unsigned long zero_ul __read_only;
+ static unsigned long one_ul __read_only = 1;
+ static unsigned long long_max __read_only = LONG_MAX;
+ static int one_hundred __read_only = 100;
+ static int one_thousand __read_only = 1000;
+ #ifdef CONFIG_PRINTK
+ static int ten_thousand __read_only = 10000;
+ #endif
+ #ifdef CONFIG_PERF_EVENTS
+ static int six_hundred_forty_kb __read_only = 640 * 1024;
+ #endif
+ 
+ /* this is needed for the proc_doulongvec_minmax of vm_dirty_bytes */
+ static unsigned long dirty_bytes_min __read_only = 2 * PAGE_SIZE;
+ 
+ /* this is needed for the proc_dointvec_minmax for [fs_]overflow UID and GID */
+ static int maxolduid __read_only = 65535;
+ static int minolduid __read_only;
+ 
+ static int ngroups_max __read_only = NGROUPS_MAX;
+ static const int cap_last_cap = CAP_LAST_CAP;
+ 
+ /*
+  * This is needed for proc_doulongvec_minmax of sysctl_hung_task_timeout_secs
+  * and hung_task_check_interval_secs
+  */
+ #ifdef CONFIG_DETECT_HUNG_TASK
+ static unsigned long hung_task_timeout_max __read_only = (LONG_MAX/HZ);
+ #endif
+ 
+ int device_sidechannel_restrict __read_mostly = 1;
+ EXPORT_SYMBOL(device_sidechannel_restrict);
+ 
+ #ifdef CONFIG_INOTIFY_USER
+ #include <linux/inotify.h>
+ #endif
+ #ifdef CONFIG_SPARC
+ #endif
+ 
+ #ifdef CONFIG_PARISC
+ extern int pwrsw_enabled;
+ #endif
+ 
+ #ifdef CONFIG_SYSCTL_ARCH_UNALIGN_ALLOW
+ extern int unaligned_enabled;
+ #endif
+ 
+ #ifdef CONFIG_IA64
+ extern int unaligned_dump_stack;
+ #endif
+ 
+ #ifdef CONFIG_SYSCTL_ARCH_UNALIGN_NO_WARN
+ extern int no_unaligned_warning;
+ #endif
+ 
+ #ifdef CONFIG_PROC_SYSCTL
+ 
+ /**
+  * enum sysctl_writes_mode - supported sysctl write modes
+  *
+  * @SYSCTL_WRITES_LEGACY: each write syscall must fully contain the sysctl value
+  *	to be written, and multiple writes on the same sysctl file descriptor
+  *	will rewrite the sysctl value, regardless of file position. No warning
+  *	is issued when the initial position is not 0.
+  * @SYSCTL_WRITES_WARN: same as above but warn when the initial file position is
+  *	not 0.
+  * @SYSCTL_WRITES_STRICT: writes to numeric sysctl entries must always be at
+  *	file position 0 and the value must be fully contained in the buffer
+  *	sent to the write syscall. If dealing with strings respect the file
+  *	position, but restrict this to the max length of the buffer, anything
+  *	passed the max length will be ignored. Multiple writes will append
+  *	to the buffer.
+  *
+  * These write modes control how current file position affects the behavior of
+  * updating sysctl values through the proc interface on each write.
+  */
+ enum sysctl_writes_mode {
+ 	SYSCTL_WRITES_LEGACY		= -1,
+ 	SYSCTL_WRITES_WARN		= 0,
+ 	SYSCTL_WRITES_STRICT		= 1,
+ };
+ 
+ static enum sysctl_writes_mode sysctl_writes_strict = SYSCTL_WRITES_STRICT;
+ 
+ static int proc_do_cad_pid(struct ctl_table *table, int write,
+ 		  void __user *buffer, size_t *lenp, loff_t *ppos);
+ static int proc_taint(struct ctl_table *table, int write,
+ 			       void __user *buffer, size_t *lenp, loff_t *ppos);
+ #endif
+ 
+ static int proc_dointvec_minmax_coredump(struct ctl_table *table, int write,
+ 		void __user *buffer, size_t *lenp, loff_t *ppos);
+ #ifdef CONFIG_COREDUMP
+ static int proc_dostring_coredump(struct ctl_table *table, int write,
+ 		void __user *buffer, size_t *lenp, loff_t *ppos);
+ #endif
+ static int proc_dopipe_max_size(struct ctl_table *table, int write,
+ 		void __user *buffer, size_t *lenp, loff_t *ppos);
+ 
+ #ifdef CONFIG_MAGIC_SYSRQ
+ /* Note: sysrq code uses its own private copy */
+ static int __sysrq_enabled = CONFIG_MAGIC_SYSRQ_DEFAULT_ENABLE;
+ 
+ static int sysrq_sysctl_handler(struct ctl_table *table, int write,
+ 				void __user *buffer, size_t *lenp,
+ 				loff_t *ppos)
+ {
+ 	int error;
+ 
+ 	error = proc_dointvec(table, write, buffer, lenp, ppos);
+ 	if (error)
+ 		return error;
+ 
+ 	if (write)
+ 		sysrq_toggle_support(__sysrq_enabled);
+ 
+ 	return 0;
+ }
+ 
+ #endif
+ 
+ static struct ctl_table kern_table[];
+ static struct ctl_table vm_table[];
+ static struct ctl_table fs_table[];
+ static struct ctl_table debug_table[];
+ static struct ctl_table dev_table[];
+ extern struct ctl_table random_table[];
+ #ifdef CONFIG_EPOLL
+ extern struct ctl_table epoll_table[];
+ #endif
+ 
+ #ifdef CONFIG_FW_LOADER_USER_HELPER
+ extern struct ctl_table firmware_config_table[];
+ #endif
+ 
+ #if defined(HAVE_ARCH_PICK_MMAP_LAYOUT) || \
+     defined(CONFIG_ARCH_WANT_DEFAULT_TOPDOWN_MMAP_LAYOUT)
+ int sysctl_legacy_va_layout;
+ #endif
+ 
+ /* The default sysctl tables: */
+ 
+ static struct ctl_table sysctl_base_table[] = {
+ 	{
+ 		.procname	= "kernel",
+ 		.mode		= 0555,
+ 		.child		= kern_table,
+ 	},
+ 	{
+ 		.procname	= "vm",
+ 		.mode		= 0555,
+ 		.child		= vm_table,
+ 	},
+ 	{
+ 		.procname	= "fs",
+ 		.mode		= 0555,
+ 		.child		= fs_table,
+ 	},
+ 	{
+ 		.procname	= "debug",
+ 		.mode		= 0555,
+ 		.child		= debug_table,
+ 	},
+ 	{
+ 		.procname	= "dev",
+ 		.mode		= 0555,
+ 		.child		= dev_table,
+ 	},
+ 	{ }
+ };
+ 
+ extern struct ctl_table grsecurity_table[];
+ 
+ #ifdef CONFIG_MINISEC_SOFTMODE
+ static struct ctl_table pax_table[] = {
+ 	{
+ 		.procname	= "softmode",
+ 		.data		= &pax_softmode,
+ 		.maxlen		= sizeof(unsigned int),
+ 		.mode		= 0600,
+ 		.proc_handler	= &proc_dointvec,
+ 	},
+ 
+ 	{ }
+ };
+ #endif
+ 
+ #ifdef CONFIG_SCHED_DEBUG
+ static int min_sched_granularity_ns __read_only = 100000;		/* 100 usecs */
+ static int max_sched_granularity_ns __read_only = NSEC_PER_SEC;	/* 1 second */
+ static int min_wakeup_granularity_ns __read_only;			/* 0 usecs */
+ static int max_wakeup_granularity_ns __read_only = NSEC_PER_SEC;	/* 1 second */
+ #ifdef CONFIG_SMP
+ static int min_sched_tunable_scaling __read_only = SCHED_TUNABLESCALING_NONE;
+ static int max_sched_tunable_scaling __read_only = SCHED_TUNABLESCALING_END-1;
+ #endif /* CONFIG_SMP */
+ #endif /* CONFIG_SCHED_DEBUG */
+ 
+ #ifdef CONFIG_COMPACTION
+ static int min_extfrag_threshold __read_only;
+ static int max_extfrag_threshold __read_only = 1000;
+ #endif
+ 
+ static struct ctl_table kern_table[] = {
+ 	#ifdef CONFIG_MINISEC_SOFTMODE
+ 	{
+ 		.procname	= "pax",
+ 		.mode		= 0500,
+ 		.child		= pax_table,
+ 	},
+ #endif
+ #ifdef CONFIG_MINISEC_SYSCTL
+ 	{
+ 		.procname	= "grsecurity",
+ 		.mode		= 0500,
+ 		.child		= grsecurity_table,
+ 	},
+ #endif
+ #ifdef CONFIG_MINISEC_SOFTMODE
+ 	{
+ 		.procname	= "pax",
+ 		.mode		= 0500,
+ 		.child		= pax_table,
+ 	},
+ #endif
+ #ifdef CONFIG_MINISEC_SYSCTL
+ 	{
+ 		.procname	= "grsecurity",
+ 		.mode		= 0500,
+ 		.child		= grsecurity_table,
+ 	},
+ #endif
+ 	{
+ 		.procname	= "sched_child_runs_first",
+ 		.data		= &sysctl_sched_child_runs_first,
+ 		.maxlen		= sizeof(unsigned int),
+ 		.mode		= 0644,
+ 		.proc_handler	= proc_dointvec,
+ 	},
+ #ifdef CONFIG_SCHED_DEBUG
+ 	{
+ 		.procname	= "sched_min_granularity_ns",
+ 		.data		= &sysctl_sched_min_granularity,
+ 		.maxlen		= sizeof(unsigned int),
+ 		.mode		= 0644,
+ 		.proc_handler	= sched_proc_update_handler,
+ 		.extra1		= &min_sched_granularity_ns,
+ 		.extra2		= &max_sched_granularity_ns,
+ 	},
+ 	{
+ 		.procname	= "sched_latency_ns",
+ 		.data		= &sysctl_sched_latency,
+ 		.maxlen		= sizeof(unsigned int),
+ 		.mode		= 0644,
+ 		.proc_handler	= sched_proc_update_handler,
+ 		.extra1		= &min_sched_granularity_ns,
+ 		.extra2		= &max_sched_granularity_ns,
+ 	},
+ 	{
+ 		.procname	= "sched_wakeup_granularity_ns",
+ 		.data		= &sysctl_sched_wakeup_granularity,
+ 		.maxlen		= sizeof(unsigned int),
+ 		.mode		= 0644,
+ 		.proc_handler	= sched_proc_update_handler,
+ 		.extra1		= &min_wakeup_granularity_ns,
+ 		.extra2		= &max_wakeup_granularity_ns,
+ 	},
+ #ifdef CONFIG_SMP
+ 	{
+ 		.procname	= "sched_tunable_scaling",
+ 		.data		= &sysctl_sched_tunable_scaling,
+ 		.maxlen		= sizeof(enum sched_tunable_scaling),
+ 		.mode		= 0644,
+ 		.proc_handler	= sched_proc_update_handler,
+ 		.extra1		= &min_sched_tunable_scaling,
+ 		.extra2		= &max_sched_tunable_scaling,
+ 	},
+ 	{
+ 		.procname	= "sched_migration_cost_ns",
+ 		.data		= &sysctl_sched_migration_cost,
+ 		.maxlen		= sizeof(unsigned int),
+ 		.mode		= 0644,
+ 		.proc_handler	= proc_dointvec,
+ 	},
+ 	{
+ 		.procname	= "sched_nr_migrate",
+ 		.data		= &sysctl_sched_nr_migrate,
+ 		.maxlen		= sizeof(unsigned int),
+ 		.mode		= 0644,
+ 		.proc_handler	= proc_dointvec,
+ 	},
+ #ifdef CONFIG_SCHEDSTATS
+ 	{
+ 		.procname	= "sched_schedstats",
+ 		.data		= NULL,
+ 		.maxlen		= sizeof(unsigned int),
+ 		.mode		= 0644,
+ 		.proc_handler	= sysctl_schedstats,
+ 		.extra1		= SYSCTL_ZERO,
+ 		.extra2		= SYSCTL_ONE,
+ 	},
+ #endif /* CONFIG_SCHEDSTATS */
+ #endif /* CONFIG_SMP */
+ #ifdef CONFIG_NUMA_BALANCING
+ 	{
+ 		.procname	= "numa_balancing_scan_delay_ms",
+ 		.data		= &sysctl_numa_balancing_scan_delay,
+ 		.maxlen		= sizeof(unsigned int),
+ 		.mode		= 0644,
+ 		.proc_handler	= proc_dointvec,
+ 	},
+ 	{
+ 		.procname	= "numa_balancing_scan_period_min_ms",
+ 		.data		= &sysctl_numa_balancing_scan_period_min,
+ 		.maxlen		= sizeof(unsigned int),
+ 		.mode		= 0644,
+ 		.proc_handler	= proc_dointvec,
+ 	},
+ 	{
+ 		.procname	= "numa_balancing_scan_period_max_ms",
+ 		.data		= &sysctl_numa_balancing_scan_period_max,
+ 		.maxlen		= sizeof(unsigned int),
+ 		.mode		= 0644,
+ 		.proc_handler	= proc_dointvec,
+ 	},
+ 	{
+ 		.procname	= "numa_balancing_scan_size_mb",
+ 		.data		= &sysctl_numa_balancing_scan_size,
+ 		.maxlen		= sizeof(unsigned int),
+ 		.mode		= 0644,
+ 		.proc_handler	= proc_dointvec_minmax,
+ 		.extra1		= SYSCTL_ONE,
+ 	},
+ 	{
+ 		.procname	= "numa_balancing",
+ 		.data		= NULL, /* filled in by handler */
+ 		.maxlen		= sizeof(unsigned int),
+ 		.mode		= 0644,
+ 		.proc_handler	= sysctl_numa_balancing,
+ 		.extra1		= SYSCTL_ZERO,
+ 		.extra2		= SYSCTL_ONE,
+ 	},
+ #endif /* CONFIG_NUMA_BALANCING */
+ #endif /* CONFIG_SCHED_DEBUG */
+ 	{
+ 		.procname	= "sched_rt_period_us",
+ 		.data		= &sysctl_sched_rt_period,
+ 		.maxlen		= sizeof(unsigned int),
+ 		.mode		= 0644,
+ 		.proc_handler	= sched_rt_handler,
+ 	},
+ 	{
+ 		.procname	= "sched_rt_runtime_us",
+ 		.data		= &sysctl_sched_rt_runtime,
+ 		.maxlen		= sizeof(int),
+ 		.mode		= 0644,
+ 		.proc_handler	= sched_rt_handler,
+ 	},
+ 	{
+ 		.procname	= "sched_rr_timeslice_ms",
+ 		.data		= &sysctl_sched_rr_timeslice,
+ 		.maxlen		= sizeof(int),
+ 		.mode		= 0644,
+ 		.proc_handler	= sched_rr_handler,
+ 	},
+ #ifdef CONFIG_UCLAMP_TASK
+ 	{
+ 		.procname	= "sched_util_clamp_min",
+ 		.data		= &sysctl_sched_uclamp_util_min,
+ 		.maxlen		= sizeof(unsigned int),
+ 		.mode		= 0644,
+ 		.proc_handler	= sysctl_sched_uclamp_handler,
+ 	},
+ 	{
+ 		.procname	= "sched_util_clamp_max",
+ 		.data		= &sysctl_sched_uclamp_util_max,
+ 		.maxlen		= sizeof(unsigned int),
+ 		.mode		= 0644,
+ 		.proc_handler	= sysctl_sched_uclamp_handler,
+ 	},
+ #endif
+ #ifdef CONFIG_SCHED_AUTOGROUP
+ 	{
+ 		.procname	= "sched_autogroup_enabled",
+ 		.data		= &sysctl_sched_autogroup_enabled,
+ 		.maxlen		= sizeof(unsigned int),
+ 		.mode		= 0644,
+ 		.proc_handler	= proc_dointvec_minmax,
+ 		.extra1		= SYSCTL_ZERO,
+ 		.extra2		= SYSCTL_ONE,
+ 	},
+ #endif
+ #ifdef CONFIG_CFS_BANDWIDTH
+ 	{
+ 		.procname	= "sched_cfs_bandwidth_slice_us",
+ 		.data		= &sysctl_sched_cfs_bandwidth_slice,
+ 		.maxlen		= sizeof(unsigned int),
+ 		.mode		= 0644,
+ 		.proc_handler	= proc_dointvec_minmax,
+ 		.extra1		= SYSCTL_ONE,
+ 	},
+ #endif
+ #if defined(CONFIG_ENERGY_MODEL) && defined(CONFIG_CPU_FREQ_GOV_SCHEDUTIL)
+ 	{
+ 		.procname	= "sched_energy_aware",
+ 		.data		= &sysctl_sched_energy_aware,
+ 		.maxlen		= sizeof(unsigned int),
+ 		.mode		= 0644,
+ 		.proc_handler	= sched_energy_aware_handler,
+ 		.extra1		= SYSCTL_ZERO,
+ 		.extra2		= SYSCTL_ONE,
+ 	},
+ #endif
+ #ifdef CONFIG_PROVE_LOCKING
+ 	{
+ 		.procname	= "prove_locking",
+ 		.data		= &prove_locking,
+ 		.maxlen		= sizeof(int),
+ 		.mode		= 0644,
+ 		.proc_handler	= proc_dointvec,
+ 	},
+ #endif
+ #ifdef CONFIG_LOCK_STAT
+ 	{
+ 		.procname	= "lock_stat",
+ 		.data		= &lock_stat,
+ 		.maxlen		= sizeof(int),
+ 		.mode		= 0644,
+ 		.proc_handler	= proc_dointvec,
+ 	},
+ #endif
+ 	{
+ 		.procname	= "panic",
+ 		.data		= &panic_timeout,
+ 		.maxlen		= sizeof(int),
+ 		.mode		= 0644,
+ 		.proc_handler	= proc_dointvec,
+ 	},
+ #ifdef CONFIG_COREDUMP
+ 	{
+ 		.procname	= "core_uses_pid",
+ 		.data		= &core_uses_pid,
+ 		.maxlen		= sizeof(int),
+ 		.mode		= 0644,
+ 		.proc_handler	= proc_dointvec,
+ 	},
+ 	{
+ 		.procname	= "core_pattern",
+ 		.data		= core_pattern,
+ 		.maxlen		= CORENAME_MAX_SIZE,
+ 		.mode		= 0644,
+ 		.proc_handler	= proc_dostring_coredump,
+ 	},
+ 	{
+ 		.procname	= "core_pipe_limit",
+ 		.data		= &core_pipe_limit,
+ 		.maxlen		= sizeof(unsigned int),
+ 		.mode		= 0644,
+ 		.proc_handler	= proc_dointvec,
+ 	},
+ #endif
+ #ifdef CONFIG_USER_NS
+ 	{
+ 		.procname	= "unprivileged_userns_clone",
+ 		.data		= &unprivileged_userns_clone,
+ 		.maxlen		= sizeof(int),
+ 		.mode		= 0644,
+ 		.proc_handler	= proc_dointvec,
+ 	},
+ #endif
+ #ifdef CONFIG_PROC_SYSCTL
+ 	{
+ 		.procname	= "tainted",
+ 		.maxlen 	= sizeof(long),
+ 		.mode		= 0644,
+ 		.proc_handler	= proc_taint,
+ 	},
+ 	{
+ 		.procname	= "sysctl_writes_strict",
+ 		.data		= &sysctl_writes_strict,
+ 		.maxlen		= sizeof(int),
+ 		.mode		= 0644,
+ 		.proc_handler	= proc_dointvec_minmax,
+ 		.extra1		= &neg_one,
+ 		.extra2		= SYSCTL_ONE,
+ 	},
+ #endif
+ #ifdef CONFIG_LATENCYTOP
+ 	{
+ 		.procname	= "latencytop",
+ 		.data		= &latencytop_enabled,
+ 		.maxlen		= sizeof(int),
+ 		.mode		= 0644,
+ 		.proc_handler	= sysctl_latencytop,
+ 	},
+ #endif
+ #ifdef CONFIG_BLK_DEV_INITRD
+ 	{
+ 		.procname	= "real-root-dev",
+ 		.data		= &real_root_dev,
+ 		.maxlen		= sizeof(int),
+ 		.mode		= 0644,
+ 		.proc_handler	= proc_dointvec,
+ 	},
+ #endif
+ 	{
+ 		.procname	= "print-fatal-signals",
+ 		.data		= &print_fatal_signals,
+ 		.maxlen		= sizeof(int),
+ 		.mode		= 0644,
+ 		.proc_handler	= proc_dointvec,
+ 	},
+ #ifdef CONFIG_SPARC
+ 	{
+ 		.procname	= "reboot-cmd",
+ 		.data		= reboot_command,
+ 		.maxlen		= 256,
+ 		.mode		= 0644,
+ 		.proc_handler	= proc_dostring,
+ 	},
+ 	{
+ 		.procname	= "stop-a",
+ 		.data		= &stop_a_enabled,
+ 		.maxlen		= sizeof (int),
+ 		.mode		= 0644,
+ 		.proc_handler	= proc_dointvec,
+ 	},
+ 	{
+ 		.procname	= "scons-poweroff",
+ 		.data		= &scons_pwroff,
+ 		.maxlen		= sizeof (int),
+ 		.mode		= 0644,
+ 		.proc_handler	= proc_dointvec,
+ 	},
+ #endif
+ #ifdef CONFIG_SPARC64
+ 	{
+ 		.procname	= "tsb-ratio",
+ 		.data		= &sysctl_tsb_ratio,
+ 		.maxlen		= sizeof (int),
+ 		.mode		= 0644,
+ 		.proc_handler	= proc_dointvec,
+ 	},
+ #endif
+ #ifdef CONFIG_PARISC
+ 	{
+ 		.procname	= "soft-power",
+ 		.data		= &pwrsw_enabled,
+ 		.maxlen		= sizeof (int),
+ 	 	.mode		= 0644,
+ 		.proc_handler	= proc_dointvec,
+ 	},
+ #endif
+ #ifdef CONFIG_SYSCTL_ARCH_UNALIGN_ALLOW
+ 	{
+ 		.procname	= "unaligned-trap",
+ 		.data		= &unaligned_enabled,
+ 		.maxlen		= sizeof (int),
+ 		.mode		= 0644,
+ 		.proc_handler	= proc_dointvec,
+ 	},
+ #endif
+ 	{
+ 		.procname	= "ctrl-alt-del",
+ 		.data		= &C_A_D,
+ 		.maxlen		= sizeof(int),
+ 		.mode		= 0644,
+ 		.proc_handler	= proc_dointvec,
+ 	},
+ #ifdef CONFIG_FUNCTION_TRACER
+ 	{
+ 		.procname	= "ftrace_enabled",
+ 		.data		= &ftrace_enabled,
+ 		.maxlen		= sizeof(int),
+ 		.mode		= 0644,
+ 		.proc_handler	= ftrace_enable_sysctl,
+ 	},
+ #endif
+ #ifdef CONFIG_STACK_TRACER
+ 	{
+ 		.procname	= "stack_tracer_enabled",
+ 		.data		= &stack_tracer_enabled,
+ 		.maxlen		= sizeof(int),
+ 		.mode		= 0644,
+ 		.proc_handler	= stack_trace_sysctl,
+ 	},
+ #endif
+ #ifdef CONFIG_TRACING
+ 	{
+ 		.procname	= "ftrace_dump_on_oops",
+ 		.data		= &ftrace_dump_on_oops,
+ 		.maxlen		= sizeof(int),
+ 		.mode		= 0644,
+ 		.proc_handler	= proc_dointvec,
+ 	},
+ 	{
+ 		.procname	= "traceoff_on_warning",
+ 		.data		= &__disable_trace_on_warning,
+ 		.maxlen		= sizeof(__disable_trace_on_warning),
+ 		.mode		= 0644,
+ 		.proc_handler	= proc_dointvec,
+ 	},
+ 	{
+ 		.procname	= "tracepoint_printk",
+ 		.data		= &tracepoint_printk,
+ 		.maxlen		= sizeof(tracepoint_printk),
+ 		.mode		= 0644,
+ 		.proc_handler	= tracepoint_printk_sysctl,
+ 	},
+ #endif
+ #ifdef CONFIG_KEXEC_CORE
+ 	{
+ 		.procname	= "kexec_load_disabled",
+ 		.data		= &kexec_load_disabled,
+ 		.maxlen		= sizeof(int),
+ 		.mode		= 0644,
+ 		/* only handle a transition from default "0" to "1" */
+ 		.proc_handler	= proc_dointvec_minmax,
+ 		.extra1		= SYSCTL_ONE,
+ 		.extra2		= SYSCTL_ONE,
+ 	},
+ #endif
+ #ifdef CONFIG_MODULES
+ 	{
+ 		.procname	= "modprobe",
+ 		.data		= &modprobe_path,
+ 		.maxlen		= KMOD_PATH_LEN,
+ 		.mode		= 0644,
+ 		.proc_handler	= proc_dostring,
+ 	},
+ 	{
+ 		.procname	= "modules_disabled",
+ 		.data		= &modules_disabled,
+ 		.maxlen		= sizeof(int),
+ 		.mode		= 0644,
+ 		/* only handle a transition from default "0" to "1" */
+ 		.proc_handler	= proc_dointvec_minmax,
+ 		.extra1		= SYSCTL_ONE,
+ 		.extra2		= SYSCTL_ONE,
+ 	},
+ #endif
+ #ifdef CONFIG_UEVENT_HELPER
+ 	{
+ 		.procname	= "hotplug",
+ 		.data		= &uevent_helper,
+ 		.maxlen		= UEVENT_HELPER_PATH_LEN,
+ 		.mode		= 0644,
+ 		.proc_handler	= proc_dostring,
+ 	},
+ #endif
+ #ifdef CONFIG_CHR_DEV_SG
+ 	{
+ 		.procname	= "sg-big-buff",
+ 		.data		= &sg_big_buff,
+ 		.maxlen		= sizeof (int),
+ 		.mode		= 0444,
+ 		.proc_handler	= proc_dointvec,
+ 	},
+ #endif
+ #ifdef CONFIG_BSD_PROCESS_ACCT
+ 	{
+ 		.procname	= "acct",
+ 		.data		= &acct_parm,
+ 		.maxlen		= 3*sizeof(int),
+ 		.mode		= 0644,
+ 		.proc_handler	= proc_dointvec,
+ 	},
+ #endif
+ #ifdef CONFIG_MAGIC_SYSRQ
+ 	{
+ 		.procname	= "sysrq",
+ 		.data		= &__sysrq_enabled,
+ 		.maxlen		= sizeof (int),
+ 		.mode		= 0644,
+ 		.proc_handler	= sysrq_sysctl_handler,
+ 	},
+ #endif
+ #ifdef CONFIG_PROC_SYSCTL
+ 	{
+ 		.procname	= "cad_pid",
+ 		.data		= NULL,
+ 		.maxlen		= sizeof (int),
+ 		.mode		= 0600,
+ 		.proc_handler	= proc_do_cad_pid,
+ 	},
+ #endif
+ 	{
+ 		.procname	= "threads-max",
+ 		.data		= NULL,
+ 		.maxlen		= sizeof(int),
+ 		.mode		= 0644,
+ 		.proc_handler	= sysctl_max_threads,
+ 	},
+ 	{
+ 		.procname	= "random",
+ 		.mode		= 0555,
+ 		.child		= random_table,
+ 	},
+ 	{
+ 		.procname	= "usermodehelper",
+ 		.mode		= 0555,
+ 		.child		= usermodehelper_table,
+ 	},
+ #ifdef CONFIG_FW_LOADER_USER_HELPER
+ 	{
+ 		.procname	= "firmware_config",
+ 		.mode		= 0555,
+ 		.child		= firmware_config_table,
+ 	},
+ #endif
+ 	{
+ 		.procname	= "overflowuid",
+ 		.data		= &overflowuid,
+ 		.maxlen		= sizeof(int),
+ 		.mode		= 0644,
+ 		.proc_handler	= proc_dointvec_minmax,
+ 		.extra1		= &minolduid,
+ 		.extra2		= &maxolduid,
+ 	},
+ 	{
+ 		.procname	= "overflowgid",
+ 		.data		= &overflowgid,
+ 		.maxlen		= sizeof(int),
+ 		.mode		= 0644,
+ 		.proc_handler	= proc_dointvec_minmax,
+ 		.extra1		= &minolduid,
+ 		.extra2		= &maxolduid,
+ 	},
+ #ifdef CONFIG_S390
+ #ifdef CONFIG_MATHEMU
+ 	{
+ 		.procname	= "ieee_emulation_warnings",
+ 		.data		= &sysctl_ieee_emulation_warnings,
+ 		.maxlen		= sizeof(int),
+ 		.mode		= 0644,
+ 		.proc_handler	= proc_dointvec,
+ 	},
+ #endif
+ 	{
+ 		.procname	= "userprocess_debug",
+ 		.data		= &show_unhandled_signals,
+ 		.maxlen		= sizeof(int),
+ 		.mode		= 0644,
+ 		.proc_handler	= proc_dointvec,
+ 	},
+ #endif
+ 	{
+ 		.procname	= "pid_max",
+ 		.data		= &pid_max,
+ 		.maxlen		= sizeof (int),
+ 		.mode		= 0644,
+ 		.proc_handler	= proc_dointvec_minmax,
+ 		.extra1		= &pid_max_min,
+ 		.extra2		= &pid_max_max,
+ 	},
+ 	{
+ 		.procname	= "panic_on_oops",
+ 		.data		= &panic_on_oops,
+ 		.maxlen		= sizeof(int),
+ 		.mode		= 0644,
+ 		.proc_handler	= proc_dointvec,
+ 	},
+ 	{
+ 		.procname	= "panic_print",
+ 		.data		= &panic_print,
+ 		.maxlen		= sizeof(unsigned long),
+ 		.mode		= 0644,
+ 		.proc_handler	= proc_doulongvec_minmax,
+ 	},
+ #if defined CONFIG_PRINTK
+ 	{
+ 		.procname	= "printk",
+ 		.data		= &console_loglevel,
+ 		.maxlen		= 4*sizeof(int),
+ 		.mode		= 0644,
+ 		.proc_handler	= proc_dointvec,
+ 	},
+ 	{
+ 		.procname	= "printk_ratelimit",
+ 		.data		= &printk_ratelimit_state.interval,
+ 		.maxlen		= sizeof(int),
+ 		.mode		= 0644,
+ 		.proc_handler	= proc_dointvec_jiffies,
+ 	},
+ 	{
+ 		.procname	= "printk_ratelimit_burst",
+ 		.data		= &printk_ratelimit_state.burst,
+ 		.maxlen		= sizeof(int),
+ 		.mode		= 0644,
+ 		.proc_handler	= proc_dointvec,
+ 	},
+ 	{
+ 		.procname	= "printk_delay",
+ 		.data		= &printk_delay_msec,
+ 		.maxlen		= sizeof(int),
+ 		.mode		= 0644,
+ 		.proc_handler	= proc_dointvec_minmax,
+ 		.extra1		= SYSCTL_ZERO,
+ 		.extra2		= &ten_thousand,
+ 	},
+ 	{
+ 		.procname	= "printk_devkmsg",
+ 		.data		= devkmsg_log_str,
+ 		.maxlen		= DEVKMSG_STR_MAX_SIZE,
+ 		.mode		= 0644,
+ 		.proc_handler	= devkmsg_sysctl_set_loglvl,
+ 	},
+ 	{
+ 		.procname	= "dmesg_restrict",
+ 		.data		= &dmesg_restrict,
+ 		.maxlen		= sizeof(int),
+ 		.mode		= 0644,
+ 		.proc_handler	= proc_dointvec_minmax_sysadmin,
+ 		.extra1		= SYSCTL_ZERO,
+ 		.extra2		= SYSCTL_ONE,
+ 	},
+ 	{
+ 		.procname	= "kptr_restrict",
+ 		.data		= &kptr_restrict,
+ 		.maxlen		= sizeof(int),
+ 		.mode		= 0644,
+ 		.proc_handler	= proc_dointvec_minmax_sysadmin,
+ 		.extra1		= SYSCTL_ZERO,
+ 		.extra2		= &two,
+ 	},
+ #endif
+ #if defined CONFIG_TTY
+ 	{
+ 		.procname	= "tiocsti_restrict",
+ 		.data		= &tiocsti_restrict,
+ 		.maxlen		= sizeof(int),
+ 		.mode		= 0644,
+ 		.proc_handler	= proc_dointvec_minmax_sysadmin,
+ 		.extra1		= SYSCTL_ZERO,
+ 		.extra2		= SYSCTL_ONE,
+ 	},
+ #endif
+ 	{
+ 		.procname	= "device_sidechannel_restrict",
+ 		.data		= &device_sidechannel_restrict,
+ 		.maxlen		= sizeof(int),
+ 		.mode		= 0644,
+ 		.proc_handler	= proc_dointvec_minmax_sysadmin,
+ 		.extra1		= SYSCTL_ZERO,
+ 		.extra2		= SYSCTL_ONE,
+ 	},
+ #ifdef CONFIG_SYSVIPC
+ 	{
+ 		.procname	= "harden_ipc",
+ 		.data		= &harden_ipc,
+ 		.maxlen		= sizeof(int),
+ 		.mode		= 0644,
+ 		.proc_handler	= &proc_dointvec_minmax_sysadmin,
+ 		.extra1		= SYSCTL_ZERO,
+ 		.extra2		= SYSCTL_ONE,
+ 	},
+ #endif
+ 	{
+ 		.procname	= "ngroups_max",
+ 		.data		= &ngroups_max,
+ 		.maxlen		= sizeof (int),
+ 		.mode		= 0444,
+ 		.proc_handler	= proc_dointvec,
+ 	},
+ 	{
+ 		.procname	= "cap_last_cap",
+ 		.data		= (void *)&cap_last_cap,
+ 		.maxlen		= sizeof(int),
+ 		.mode		= 0444,
+ 		.proc_handler	= proc_dointvec,
+ 	},
+ #if defined(CONFIG_LOCKUP_DETECTOR)
+ 	{
+ 		.procname       = "watchdog",
+ 		.data		= &watchdog_user_enabled,
+ 		.maxlen		= sizeof(int),
+ 		.mode		= 0644,
+ 		.proc_handler   = proc_watchdog,
+ 		.extra1		= SYSCTL_ZERO,
+ 		.extra2		= SYSCTL_ONE,
+ 	},
+ 	{
+ 		.procname	= "watchdog_thresh",
+ 		.data		= &watchdog_thresh,
+ 		.maxlen		= sizeof(int),
+ 		.mode		= 0644,
+ 		.proc_handler	= proc_watchdog_thresh,
+ 		.extra1		= SYSCTL_ZERO,
+ 		.extra2		= &sixty,
+ 	},
+ 	{
+ 		.procname       = "nmi_watchdog",
+ 		.data		= &nmi_watchdog_user_enabled,
+ 		.maxlen		= sizeof(int),
+ 		.mode		= NMI_WATCHDOG_SYSCTL_PERM,
+ 		.proc_handler   = proc_nmi_watchdog,
+ 		.extra1		= SYSCTL_ZERO,
+ 		.extra2		= SYSCTL_ONE,
+ 	},
+ 	{
+ 		.procname	= "watchdog_cpumask",
+ 		.data		= &watchdog_cpumask_bits,
+ 		.maxlen		= NR_CPUS,
+ 		.mode		= 0644,
+ 		.proc_handler	= proc_watchdog_cpumask,
+ 	},
+ #ifdef CONFIG_SOFTLOCKUP_DETECTOR
+ 	{
+ 		.procname       = "soft_watchdog",
+ 		.data		= &soft_watchdog_user_enabled,
+ 		.maxlen		= sizeof(int),
+ 		.mode		= 0644,
+ 		.proc_handler   = proc_soft_watchdog,
+ 		.extra1		= SYSCTL_ZERO,
+ 		.extra2		= SYSCTL_ONE,
+ 	},
+ 	{
+ 		.procname	= "softlockup_panic",
+ 		.data		= &softlockup_panic,
+ 		.maxlen		= sizeof(int),
+ 		.mode		= 0644,
+ 		.proc_handler	= proc_dointvec_minmax,
+ 		.extra1		= SYSCTL_ZERO,
+ 		.extra2		= SYSCTL_ONE,
+ 	},
+ #ifdef CONFIG_SMP
+ 	{
+ 		.procname	= "softlockup_all_cpu_backtrace",
+ 		.data		= &sysctl_softlockup_all_cpu_backtrace,
+ 		.maxlen		= sizeof(int),
+ 		.mode		= 0644,
+ 		.proc_handler	= proc_dointvec_minmax,
+ 		.extra1		= SYSCTL_ZERO,
+ 		.extra2		= SYSCTL_ONE,
+ 	},
+ #endif /* CONFIG_SMP */
+ #endif
+ #ifdef CONFIG_HARDLOCKUP_DETECTOR
+ 	{
+ 		.procname	= "hardlockup_panic",
+ 		.data		= &hardlockup_panic,
+ 		.maxlen		= sizeof(int),
+ 		.mode		= 0644,
+ 		.proc_handler	= proc_dointvec_minmax,
+ 		.extra1		= SYSCTL_ZERO,
+ 		.extra2		= SYSCTL_ONE,
+ 	},
+ #ifdef CONFIG_SMP
+ 	{
+ 		.procname	= "hardlockup_all_cpu_backtrace",
+ 		.data		= &sysctl_hardlockup_all_cpu_backtrace,
+ 		.maxlen		= sizeof(int),
+ 		.mode		= 0644,
+ 		.proc_handler	= proc_dointvec_minmax,
+ 		.extra1		= SYSCTL_ZERO,
+ 		.extra2		= SYSCTL_ONE,
+ 	},
+ #endif /* CONFIG_SMP */
+ #endif
+ #endif
+ 
+ #if defined(CONFIG_X86_LOCAL_APIC) && defined(CONFIG_X86)
+ 	{
+ 		.procname       = "unknown_nmi_panic",
+ 		.data           = &unknown_nmi_panic,
+ 		.maxlen         = sizeof (int),
+ 		.mode           = 0644,
+ 		.proc_handler   = proc_dointvec,
+ 	},
+ #endif
+ #if defined(CONFIG_X86)
+ 	{
+ 		.procname	= "panic_on_unrecovered_nmi",
+ 		.data		= &panic_on_unrecovered_nmi,
+ 		.maxlen		= sizeof(int),
+ 		.mode		= 0644,
+ 		.proc_handler	= proc_dointvec,
+ 	},
+ 	{
+ 		.procname	= "panic_on_io_nmi",
+ 		.data		= &panic_on_io_nmi,
+ 		.maxlen		= sizeof(int),
+ 		.mode		= 0644,
+ 		.proc_handler	= proc_dointvec,
+ 	},
+ #ifdef CONFIG_DEBUG_STACKOVERFLOW
+ 	{
+ 		.procname	= "panic_on_stackoverflow",
+ 		.data		= &sysctl_panic_on_stackoverflow,
+ 		.maxlen		= sizeof(int),
+ 		.mode		= 0644,
+ 		.proc_handler	= proc_dointvec,
+ 	},
+ #endif
+ 	{
+ 		.procname	= "bootloader_type",
+ 		.data		= &bootloader_type,
+ 		.maxlen		= sizeof (int),
+ 		.mode		= 0444,
+ 		.proc_handler	= proc_dointvec,
+ 	},
+ 	{
+ 		.procname	= "bootloader_version",
+ 		.data		= &bootloader_version,
+ 		.maxlen		= sizeof (int),
+ 		.mode		= 0444,
+ 		.proc_handler	= proc_dointvec,
+ 	},
+ 	{
+ 		.procname	= "io_delay_type",
+ 		.data		= &io_delay_type,
+ 		.maxlen		= sizeof(int),
+ 		.mode		= 0644,
+ 		.proc_handler	= proc_dointvec,
+ 	},
+ #endif
+ #if defined(CONFIG_MMU)
+ 	{
+ 		.procname	= "randomize_va_space",
+ 		.data		= &randomize_va_space,
+ 		.maxlen		= sizeof(int),
+ 		.mode		= 0644,
+ 		.proc_handler	= proc_dointvec,
+ 	},
+ #endif
+ #if defined(CONFIG_S390) && defined(CONFIG_SMP)
+ 	{
+ 		.procname	= "spin_retry",
+ 		.data		= &spin_retry,
+ 		.maxlen		= sizeof (int),
+ 		.mode		= 0644,
+ 		.proc_handler	= proc_dointvec,
+ 	},
+ #endif
+ #if	defined(CONFIG_ACPI_SLEEP) && defined(CONFIG_X86)
+ 	{
+ 		.procname	= "acpi_video_flags",
+ 		.data		= &acpi_realmode_flags,
+ 		.maxlen		= sizeof (unsigned long),
+ 		.mode		= 0644,
+ 		.proc_handler	= proc_doulongvec_minmax,
+ 	},
+ #endif
+ #ifdef CONFIG_SYSCTL_ARCH_UNALIGN_NO_WARN
+ 	{
+ 		.procname	= "ignore-unaligned-usertrap",
+ 		.data		= &no_unaligned_warning,
+ 		.maxlen		= sizeof (int),
+ 	 	.mode		= 0644,
+ 		.proc_handler	= proc_dointvec,
+ 	},
+ #endif
+ #ifdef CONFIG_IA64
+ 	{
+ 		.procname	= "unaligned-dump-stack",
+ 		.data		= &unaligned_dump_stack,
+ 		.maxlen		= sizeof (int),
+ 		.mode		= 0644,
+ 		.proc_handler	= proc_dointvec,
+ 	},
+ #endif
+ #ifdef CONFIG_DETECT_HUNG_TASK
+ 	{
+ 		.procname	= "hung_task_panic",
+ 		.data		= &sysctl_hung_task_panic,
+ 		.maxlen		= sizeof(int),
+ 		.mode		= 0644,
+ 		.proc_handler	= proc_dointvec_minmax,
+ 		.extra1		= SYSCTL_ZERO,
+ 		.extra2		= SYSCTL_ONE,
+ 	},
+ 	{
+ 		.procname	= "hung_task_check_count",
+ 		.data		= &sysctl_hung_task_check_count,
+ 		.maxlen		= sizeof(int),
+ 		.mode		= 0644,
+ 		.proc_handler	= proc_dointvec_minmax,
+ 		.extra1		= SYSCTL_ZERO,
+ 	},
+ 	{
+ 		.procname	= "hung_task_timeout_secs",
+ 		.data		= &sysctl_hung_task_timeout_secs,
+ 		.maxlen		= sizeof(unsigned long),
+ 		.mode		= 0644,
+ 		.proc_handler	= proc_dohung_task_timeout_secs,
+ 		.extra2		= &hung_task_timeout_max,
+ 	},
+ 	{
+ 		.procname	= "hung_task_check_interval_secs",
+ 		.data		= &sysctl_hung_task_check_interval_secs,
+ 		.maxlen		= sizeof(unsigned long),
+ 		.mode		= 0644,
+ 		.proc_handler	= proc_dohung_task_timeout_secs,
+ 		.extra2		= &hung_task_timeout_max,
+ 	},
+ 	{
+ 		.procname	= "hung_task_warnings",
+ 		.data		= &sysctl_hung_task_warnings,
+ 		.maxlen		= sizeof(int),
+ 		.mode		= 0644,
+ 		.proc_handler	= proc_dointvec_minmax,
+ 		.extra1		= &neg_one,
+ 	},
+ #endif
+ #ifdef CONFIG_RT_MUTEXES
+ 	{
+ 		.procname	= "max_lock_depth",
+ 		.data		= &max_lock_depth,
+ 		.maxlen		= sizeof(int),
+ 		.mode		= 0644,
+ 		.proc_handler	= proc_dointvec,
+ 	},
+ #endif
+ 	{
+ 		.procname	= "poweroff_cmd",
+ 		.data		= &poweroff_cmd,
+ 		.maxlen		= POWEROFF_CMD_PATH_LEN,
+ 		.mode		= 0644,
+ 		.proc_handler	= proc_dostring,
+ 	},
+ #ifdef CONFIG_KEYS
+ 	{
+ 		.procname	= "keys",
+ 		.mode		= 0555,
+ 		.child		= key_sysctls,
+ 	},
+ #endif
+ #ifdef CONFIG_PERF_EVENTS
+ 	/*
+ 	 * User-space scripts rely on the existence of this file
+ 	 * as a feature check for perf_events being enabled.
+ 	 *
+ 	 * So it's an ABI, do not remove!
+ 	 */
+ 	{
+ 		.procname	= "perf_event_paranoid",
+ 		.data		= &sysctl_perf_event_paranoid,
+ 		.maxlen		= sizeof(sysctl_perf_event_paranoid),
+ 		.mode		= 0644,
+ 		.proc_handler	= proc_dointvec,
+ 	},
+ 	{
+ 		.procname	= "perf_event_mlock_kb",
+ 		.data		= &sysctl_perf_event_mlock,
+ 		.maxlen		= sizeof(sysctl_perf_event_mlock),
+ 		.mode		= 0644,
+ 		.proc_handler	= proc_dointvec,
+ 	},
+ 	{
+ 		.procname	= "perf_event_max_sample_rate",
+ 		.data		= &sysctl_perf_event_sample_rate,
+ 		.maxlen		= sizeof(sysctl_perf_event_sample_rate),
+ 		.mode		= 0644,
+ 		.proc_handler	= perf_proc_update_handler,
+ 		.extra1		= SYSCTL_ONE,
+ 	},
+ 	{
+ 		.procname	= "perf_cpu_time_max_percent",
+ 		.data		= &sysctl_perf_cpu_time_max_percent,
+ 		.maxlen		= sizeof(sysctl_perf_cpu_time_max_percent),
+ 		.mode		= 0644,
+ 		.proc_handler	= perf_cpu_time_max_percent_handler,
+ 		.extra1		= SYSCTL_ZERO,
+ 		.extra2		= &one_hundred,
+ 	},
+ 	{
+ 		.procname	= "perf_event_max_stack",
+ 		.data		= &sysctl_perf_event_max_stack,
+ 		.maxlen		= sizeof(sysctl_perf_event_max_stack),
+ 		.mode		= 0644,
+ 		.proc_handler	= perf_event_max_stack_handler,
+ 		.extra1		= SYSCTL_ZERO,
+ 		.extra2		= &six_hundred_forty_kb,
+ 	},
+ 	{
+ 		.procname	= "perf_event_max_contexts_per_stack",
+ 		.data		= &sysctl_perf_event_max_contexts_per_stack,
+ 		.maxlen		= sizeof(sysctl_perf_event_max_contexts_per_stack),
+ 		.mode		= 0644,
+ 		.proc_handler	= perf_event_max_stack_handler,
+ 		.extra1		= SYSCTL_ZERO,
+ 		.extra2		= &one_thousand,
+ 	},
+ #endif
+ 	{
+ 		.procname	= "panic_on_warn",
+ 		.data		= &panic_on_warn,
+ 		.maxlen		= sizeof(int),
+ 		.mode		= 0644,
+ 		.proc_handler	= proc_dointvec_minmax,
+ 		.extra1		= SYSCTL_ZERO,
+ 		.extra2		= SYSCTL_ONE,
+ 	},
+ #if defined(CONFIG_SMP) && defined(CONFIG_NO_HZ_COMMON)
+ 	{
+ 		.procname	= "timer_migration",
+ 		.data		= &sysctl_timer_migration,
+ 		.maxlen		= sizeof(unsigned int),
+ 		.mode		= 0644,
+ 		.proc_handler	= timer_migration_handler,
+ 		.extra1		= SYSCTL_ZERO,
+ 		.extra2		= SYSCTL_ONE,
+ 	},
+ #endif
+ #ifdef CONFIG_BPF_SYSCALL
+ 	{
+ 		.procname	= "unprivileged_bpf_disabled",
+ 		.data		= &sysctl_unprivileged_bpf_disabled,
+ 		.maxlen		= sizeof(sysctl_unprivileged_bpf_disabled),
+ 		.mode		= 0644,
+ 		/* only handle a transition from default "0" to "1" */
+ 		.proc_handler	= proc_dointvec_minmax,
+ 		.extra1		= SYSCTL_ONE,
+ 		.extra2		= SYSCTL_ONE,
+ 	},
+ 	{
+ 		.procname	= "bpf_stats_enabled",
+ 		.data		= &bpf_stats_enabled_key.key,
+ 		.maxlen		= sizeof(bpf_stats_enabled_key),
+ 		.mode		= 0644,
+ 		.proc_handler	= proc_do_static_key,
+ 	},
+ #endif
+ #if defined(CONFIG_TREE_RCU) || defined(CONFIG_PREEMPT_RCU)
+ 	{
+ 		.procname	= "panic_on_rcu_stall",
+ 		.data		= &sysctl_panic_on_rcu_stall,
+ 		.maxlen		= sizeof(sysctl_panic_on_rcu_stall),
+ 		.mode		= 0644,
+ 		.proc_handler	= proc_dointvec_minmax,
+ 		.extra1		= SYSCTL_ZERO,
+ 		.extra2		= SYSCTL_ONE,
+ 	},
+ #endif
+ #ifdef CONFIG_STACKLEAK_RUNTIME_DISABLE
+ 	{
+ 		.procname	= "stack_erasing",
+ 		.data		= NULL,
+ 		.maxlen		= sizeof(int),
+ 		.mode		= 0600,
+ 		.proc_handler	= stack_erasing_sysctl,
+ 		.extra1		= SYSCTL_ZERO,
+ 		.extra2		= SYSCTL_ONE,
+ 	},
+ #endif
+ 	{ }
+ };
+ 
+ static struct ctl_table vm_table[] = {	/* memory-management tunables; presumably exposed under the "vm" sysctl directory (registration is outside this chunk) */
+ 	{
+ 		.procname	= "overcommit_memory",
+ 		.data		= &sysctl_overcommit_memory,
+ 		.maxlen		= sizeof(sysctl_overcommit_memory),
+ 		.mode		= 0644,
+ 		.proc_handler	= proc_dointvec_minmax,
+ 		.extra1		= SYSCTL_ZERO,
+ 		.extra2		= &two,
+ 	},
+ 	{
+ 		.procname	= "panic_on_oom",
+ 		.data		= &sysctl_panic_on_oom,
+ 		.maxlen		= sizeof(sysctl_panic_on_oom),
+ 		.mode		= 0644,
+ 		.proc_handler	= proc_dointvec_minmax,
+ 		.extra1		= SYSCTL_ZERO,
+ 		.extra2		= &two,
+ 	},
+ 	{
+ 		.procname	= "oom_kill_allocating_task",
+ 		.data		= &sysctl_oom_kill_allocating_task,
+ 		.maxlen		= sizeof(sysctl_oom_kill_allocating_task),
+ 		.mode		= 0644,
+ 		.proc_handler	= proc_dointvec,
+ 	},
+ 	{
+ 		.procname	= "oom_dump_tasks",
+ 		.data		= &sysctl_oom_dump_tasks,
+ 		.maxlen		= sizeof(sysctl_oom_dump_tasks),
+ 		.mode		= 0644,
+ 		.proc_handler	= proc_dointvec,
+ 	},
+ 	{
+ 		.procname	= "overcommit_ratio",
+ 		.data		= &sysctl_overcommit_ratio,
+ 		.maxlen		= sizeof(sysctl_overcommit_ratio),
+ 		.mode		= 0644,
+ 		.proc_handler	= overcommit_ratio_handler,
+ 	},
+ 	{
+ 		.procname	= "overcommit_kbytes",
+ 		.data		= &sysctl_overcommit_kbytes,
+ 		.maxlen		= sizeof(sysctl_overcommit_kbytes),
+ 		.mode		= 0644,
+ 		.proc_handler	= overcommit_kbytes_handler,
+ 	},
+ 	{
+ 		.procname	= "page-cluster",
+ 		.data		= &page_cluster,
+ 		.maxlen		= sizeof(int),
+ 		.mode		= 0644,
+ 		.proc_handler	= proc_dointvec_minmax,
+ 		.extra1		= SYSCTL_ZERO,
+ 	},
+ 	{
+ 		.procname	= "dirty_background_ratio",
+ 		.data		= &dirty_background_ratio,
+ 		.maxlen		= sizeof(dirty_background_ratio),
+ 		.mode		= 0644,
+ 		.proc_handler	= dirty_background_ratio_handler,
+ 		.extra1		= SYSCTL_ZERO,
+ 		.extra2		= &one_hundred,
+ 	},
+ 	{
+ 		.procname	= "dirty_background_bytes",
+ 		.data		= &dirty_background_bytes,
+ 		.maxlen		= sizeof(dirty_background_bytes),
+ 		.mode		= 0644,
+ 		.proc_handler	= dirty_background_bytes_handler,
+ 		.extra1		= &one_ul,
+ 	},
+ 	{
+ 		.procname	= "dirty_ratio",
+ 		.data		= &vm_dirty_ratio,
+ 		.maxlen		= sizeof(vm_dirty_ratio),
+ 		.mode		= 0644,
+ 		.proc_handler	= dirty_ratio_handler,
+ 		.extra1		= SYSCTL_ZERO,
+ 		.extra2		= &one_hundred,
+ 	},
+ 	{
+ 		.procname	= "dirty_bytes",
+ 		.data		= &vm_dirty_bytes,
+ 		.maxlen		= sizeof(vm_dirty_bytes),
+ 		.mode		= 0644,
+ 		.proc_handler	= dirty_bytes_handler,
+ 		.extra1		= &dirty_bytes_min,
+ 	},
+ 	{
+ 		.procname	= "dirty_writeback_centisecs",
+ 		.data		= &dirty_writeback_interval,
+ 		.maxlen		= sizeof(dirty_writeback_interval),
+ 		.mode		= 0644,
+ 		.proc_handler	= dirty_writeback_centisecs_handler,
+ 	},
+ 	{
+ 		.procname	= "dirty_expire_centisecs",
+ 		.data		= &dirty_expire_interval,
+ 		.maxlen		= sizeof(dirty_expire_interval),
+ 		.mode		= 0644,
+ 		.proc_handler	= proc_dointvec_minmax,
+ 		.extra1		= SYSCTL_ZERO,
+ 	},
+ 	{
+ 		.procname	= "dirtytime_expire_seconds",
+ 		.data		= &dirtytime_expire_interval,
+ 		.maxlen		= sizeof(dirtytime_expire_interval),
+ 		.mode		= 0644,
+ 		.proc_handler	= dirtytime_interval_handler,
+ 		.extra1		= SYSCTL_ZERO,
+ 	},
+ 	{
+ 		.procname	= "swappiness",
+ 		.data		= &vm_swappiness,
+ 		.maxlen		= sizeof(vm_swappiness),
+ 		.mode		= 0644,
+ 		.proc_handler	= proc_dointvec_minmax,
+ 		.extra1		= SYSCTL_ZERO,
+ 		.extra2		= &one_hundred,
+ 	},
+ #ifdef CONFIG_HUGETLB_PAGE
+ 	{
+ 		.procname	= "nr_hugepages",
+ 		.data		= NULL,
+ 		.maxlen		= sizeof(unsigned long),
+ 		.mode		= 0644,
+ 		.proc_handler	= hugetlb_sysctl_handler,
+ 	},
+ #ifdef CONFIG_NUMA
+ 	{
+ 		.procname	= "nr_hugepages_mempolicy",
+ 		.data		= NULL,
+ 		.maxlen		= sizeof(unsigned long),
+ 		.mode		= 0644,
+ 		.proc_handler	= hugetlb_mempolicy_sysctl_handler,
+ 	},
+ 	{
+ 		.procname	= "numa_stat",
+ 		.data		= &sysctl_vm_numa_stat,
+ 		.maxlen		= sizeof(int),
+ 		.mode		= 0644,
+ 		.proc_handler	= sysctl_vm_numa_stat_handler,
+ 		.extra1		= SYSCTL_ZERO,
+ 		.extra2		= SYSCTL_ONE,
+ 	},
+ #endif
+ 	{
+ 		.procname	= "hugetlb_shm_group",
+ 		.data		= &sysctl_hugetlb_shm_group,
+ 		.maxlen		= sizeof(gid_t),
+ 		.mode		= 0644,
+ 		.proc_handler	= proc_dointvec,
+ 	},
+ 	{
+ 		.procname	= "nr_overcommit_hugepages",
+ 		.data		= NULL,
+ 		.maxlen		= sizeof(unsigned long),
+ 		.mode		= 0644,
+ 		.proc_handler	= hugetlb_overcommit_handler,
+ 	},
+ #endif
+ 	{
+ 		.procname	= "lowmem_reserve_ratio",
+ 		.data		= &sysctl_lowmem_reserve_ratio,
+ 		.maxlen		= sizeof(sysctl_lowmem_reserve_ratio),
+ 		.mode		= 0644,
+ 		.proc_handler	= lowmem_reserve_ratio_sysctl_handler,
+ 	},
+ 	{
+ 		.procname	= "drop_caches",
+ 		.data		= &sysctl_drop_caches,
+ 		.maxlen		= sizeof(int),
+ 		.mode		= 0200,	/* owner write-only: no read permission bits */
+ 		.proc_handler	= drop_caches_sysctl_handler,
+ 		.extra1		= SYSCTL_ONE,
+ 		.extra2		= &four,
+ 	},
+ #ifdef CONFIG_COMPACTION
+ 	{
+ 		.procname	= "compact_memory",
+ 		.data		= &sysctl_compact_memory,
+ 		.maxlen		= sizeof(int),
+ 		.mode		= 0200,	/* owner write-only: no read permission bits */
+ 		.proc_handler	= sysctl_compaction_handler,
+ 	},
+ 	{
+ 		.procname	= "extfrag_threshold",
+ 		.data		= &sysctl_extfrag_threshold,
+ 		.maxlen		= sizeof(int),
+ 		.mode		= 0644,
+ 		.proc_handler	= proc_dointvec_minmax,
+ 		.extra1		= &min_extfrag_threshold,
+ 		.extra2		= &max_extfrag_threshold,
+ 	},
+ 	{
+ 		.procname	= "compact_unevictable_allowed",
+ 		.data		= &sysctl_compact_unevictable_allowed,
+ 		.maxlen		= sizeof(int),
+ 		.mode		= 0644,
+ 		.proc_handler	= proc_dointvec_minmax,	/* plain proc_dointvec ignores extra1/extra2; the minmax variant enforces the 0..1 range below */
+ 		.extra1		= SYSCTL_ZERO,
+ 		.extra2		= SYSCTL_ONE,
+ 	},
+ 
+ #endif /* CONFIG_COMPACTION */
+ 	{
+ 		.procname	= "min_free_kbytes",
+ 		.data		= &min_free_kbytes,
+ 		.maxlen		= sizeof(min_free_kbytes),
+ 		.mode		= 0644,
+ 		.proc_handler	= min_free_kbytes_sysctl_handler,
+ 		.extra1		= SYSCTL_ZERO,
+ 	},
+ 	{
+ 		.procname	= "watermark_boost_factor",
+ 		.data		= &watermark_boost_factor,
+ 		.maxlen		= sizeof(watermark_boost_factor),
+ 		.mode		= 0644,
+ 		.proc_handler	= watermark_boost_factor_sysctl_handler,
+ 		.extra1		= SYSCTL_ZERO,
+ 	},
+ 	{
+ 		.procname	= "watermark_scale_factor",
+ 		.data		= &watermark_scale_factor,
+ 		.maxlen		= sizeof(watermark_scale_factor),
+ 		.mode		= 0644,
+ 		.proc_handler	= watermark_scale_factor_sysctl_handler,
+ 		.extra1		= SYSCTL_ONE,
+ 		.extra2		= &one_thousand,
+ 	},
+ 	{
+ 		.procname	= "percpu_pagelist_fraction",
+ 		.data		= &percpu_pagelist_fraction,
+ 		.maxlen		= sizeof(percpu_pagelist_fraction),
+ 		.mode		= 0644,
+ 		.proc_handler	= percpu_pagelist_fraction_sysctl_handler,
+ 		.extra1		= SYSCTL_ZERO,
+ 	},
+ #ifdef CONFIG_MMU
+ 	{
+ 		.procname	= "max_map_count",
+ 		.data		= &sysctl_max_map_count,
+ 		.maxlen		= sizeof(sysctl_max_map_count),
+ 		.mode		= 0644,
+ 		.proc_handler	= proc_dointvec_minmax,
+ 		.extra1		= SYSCTL_ZERO,
+ 	},
+ #else	/* !CONFIG_MMU: nr_trim_pages is built instead of max_map_count */
+ 	{
+ 		.procname	= "nr_trim_pages",
+ 		.data		= &sysctl_nr_trim_pages,
+ 		.maxlen		= sizeof(sysctl_nr_trim_pages),
+ 		.mode		= 0644,
+ 		.proc_handler	= proc_dointvec_minmax,
+ 		.extra1		= SYSCTL_ZERO,
+ 	},
+ #endif
+ 	{
+ 		.procname	= "laptop_mode",
+ 		.data		= &laptop_mode,
+ 		.maxlen		= sizeof(laptop_mode),
+ 		.mode		= 0644,
+ 		.proc_handler	= proc_dointvec_jiffies,
+ 	},
+ 	{
+ 		.procname	= "block_dump",
+ 		.data		= &block_dump,
+ 		.maxlen		= sizeof(block_dump),
+ 		.mode		= 0644,
+ 		.proc_handler	= proc_dointvec_minmax,
+ 		.extra1		= SYSCTL_ZERO,
+ 	},
+ 	{
+ 		.procname	= "vfs_cache_pressure",
+ 		.data		= &sysctl_vfs_cache_pressure,
+ 		.maxlen		= sizeof(sysctl_vfs_cache_pressure),
+ 		.mode		= 0644,
+ 		.proc_handler	= proc_dointvec_minmax,
+ 		.extra1		= SYSCTL_ZERO,
+ 	},
+ #if defined(HAVE_ARCH_PICK_MMAP_LAYOUT) || \
+     defined(CONFIG_ARCH_WANT_DEFAULT_TOPDOWN_MMAP_LAYOUT)
+ 	{
+ 		.procname	= "legacy_va_layout",
+ 		.data		= &sysctl_legacy_va_layout,
+ 		.maxlen		= sizeof(sysctl_legacy_va_layout),
+ 		.mode		= 0644,
+ 		.proc_handler	= proc_dointvec_minmax,
+ 		.extra1		= SYSCTL_ZERO,
+ 	},
+ #endif
+ #ifdef CONFIG_NUMA
+ 	{
+ 		.procname	= "zone_reclaim_mode",
+ 		.data		= &node_reclaim_mode,
+ 		.maxlen		= sizeof(node_reclaim_mode),
+ 		.mode		= 0644,
+ 		.proc_handler	= proc_dointvec_minmax,
+ 		.extra1		= SYSCTL_ZERO,
+ 	},
+ 	{
+ 		.procname	= "min_unmapped_ratio",
+ 		.data		= &sysctl_min_unmapped_ratio,
+ 		.maxlen		= sizeof(sysctl_min_unmapped_ratio),
+ 		.mode		= 0644,
+ 		.proc_handler	= sysctl_min_unmapped_ratio_sysctl_handler,
+ 		.extra1		= SYSCTL_ZERO,
+ 		.extra2		= &one_hundred,
+ 	},
+ 	{
+ 		.procname	= "min_slab_ratio",
+ 		.data		= &sysctl_min_slab_ratio,
+ 		.maxlen		= sizeof(sysctl_min_slab_ratio),
+ 		.mode		= 0644,
+ 		.proc_handler	= sysctl_min_slab_ratio_sysctl_handler,
+ 		.extra1		= SYSCTL_ZERO,
+ 		.extra2		= &one_hundred,
+ 	},
+ #endif
+ #ifdef CONFIG_SMP
+ 	{
+ 		.procname	= "stat_interval",
+ 		.data		= &sysctl_stat_interval,
+ 		.maxlen		= sizeof(sysctl_stat_interval),
+ 		.mode		= 0644,
+ 		.proc_handler	= proc_dointvec_jiffies,
+ 	},
+ 	{
+ 		.procname	= "stat_refresh",
+ 		.data		= NULL,
+ 		.maxlen		= 0,
+ 		.mode		= 0600,
+ 		.proc_handler	= vmstat_refresh,
+ 	},
+ #endif
+ #ifdef CONFIG_MMU
+ 	{
+ 		.procname	= "mmap_min_addr",
+ 		.data		= &dac_mmap_min_addr,
+ 		.maxlen		= sizeof(unsigned long),
+ 		.mode		= 0644,
+ 		.proc_handler	= mmap_min_addr_handler,
+ 	},
+ #endif
+ #ifdef CONFIG_NUMA
+ 	{
+ 		.procname	= "numa_zonelist_order",
+ 		.data		= &numa_zonelist_order,
+ 		.maxlen		= NUMA_ZONELIST_ORDER_LEN,
+ 		.mode		= 0644,
+ 		.proc_handler	= numa_zonelist_order_handler,
+ 	},
+ #endif
+ #if (defined(CONFIG_X86_32) && !defined(CONFIG_UML))|| \
+    (defined(CONFIG_SUPERH) && defined(CONFIG_VSYSCALL))
+ 	{
+ 		.procname	= "vdso_enabled",
+ #ifdef CONFIG_X86_32
+ 		.data		= &vdso32_enabled,
+ 		.maxlen		= sizeof(vdso32_enabled),
+ #else
+ 		.data		= &vdso_enabled,
+ 		.maxlen		= sizeof(vdso_enabled),
+ #endif
+ 		.mode		= 0644,
+ 		.proc_handler	= proc_dointvec,
+ 		.extra1		= SYSCTL_ZERO,	/* NOTE(review): not enforced by plain proc_dointvec; left as-is so currently accepted values are not rejected */
+ 	},
+ #endif
+ #ifdef CONFIG_HIGHMEM
+ 	{
+ 		.procname	= "highmem_is_dirtyable",
+ 		.data		= &vm_highmem_is_dirtyable,
+ 		.maxlen		= sizeof(vm_highmem_is_dirtyable),
+ 		.mode		= 0644,
+ 		.proc_handler	= proc_dointvec_minmax,
+ 		.extra1		= SYSCTL_ZERO,
+ 		.extra2		= SYSCTL_ONE,
+ 	},
+ #endif
+ #ifdef CONFIG_MEMORY_FAILURE
+ 	{
+ 		.procname	= "memory_failure_early_kill",
+ 		.data		= &sysctl_memory_failure_early_kill,
+ 		.maxlen		= sizeof(sysctl_memory_failure_early_kill),
+ 		.mode		= 0644,
+ 		.proc_handler	= proc_dointvec_minmax,
+ 		.extra1		= SYSCTL_ZERO,
+ 		.extra2		= SYSCTL_ONE,
+ 	},
+ 	{
+ 		.procname	= "memory_failure_recovery",
+ 		.data		= &sysctl_memory_failure_recovery,
+ 		.maxlen		= sizeof(sysctl_memory_failure_recovery),
+ 		.mode		= 0644,
+ 		.proc_handler	= proc_dointvec_minmax,
+ 		.extra1		= SYSCTL_ZERO,
+ 		.extra2		= SYSCTL_ONE,
+ 	},
+ #endif
+ 	{
+ 		.procname	= "user_reserve_kbytes",
+ 		.data		= &sysctl_user_reserve_kbytes,
+ 		.maxlen		= sizeof(sysctl_user_reserve_kbytes),
+ 		.mode		= 0644,
+ 		.proc_handler	= proc_doulongvec_minmax,
+ 	},
+ 	{
+ 		.procname	= "admin_reserve_kbytes",
+ 		.data		= &sysctl_admin_reserve_kbytes,
+ 		.maxlen		= sizeof(sysctl_admin_reserve_kbytes),
+ 		.mode		= 0644,
+ 		.proc_handler	= proc_doulongvec_minmax,
+ 	},
+ #ifdef CONFIG_HAVE_ARCH_MMAP_RND_BITS
+ 	{
+ 		.procname	= "mmap_rnd_bits",
+ 		.data		= &mmap_rnd_bits,
+ 		.maxlen		= sizeof(mmap_rnd_bits),
+ 		.mode		= 0600,
+ 		.proc_handler	= proc_dointvec_minmax,
+ 		.extra1		= (void *)&mmap_rnd_bits_min,
+ 		.extra2		= (void *)&mmap_rnd_bits_max,
+ 	},
+ #endif
+ #ifdef CONFIG_HAVE_ARCH_MMAP_RND_COMPAT_BITS
+ 	{
+ 		.procname	= "mmap_rnd_compat_bits",
+ 		.data		= &mmap_rnd_compat_bits,
+ 		.maxlen		= sizeof(mmap_rnd_compat_bits),
+ 		.mode		= 0600,
+ 		.proc_handler	= proc_dointvec_minmax,
+ 		.extra1		= (void *)&mmap_rnd_compat_bits_min,
+ 		.extra2		= (void *)&mmap_rnd_compat_bits_max,
+ 	},
+ #endif
+ #ifdef CONFIG_USERFAULTFD
+ 	{
+ 		.procname	= "unprivileged_userfaultfd",
+ 		.data		= &sysctl_unprivileged_userfaultfd,
+ 		.maxlen		= sizeof(sysctl_unprivileged_userfaultfd),
+ 		.mode		= 0644,
+ 		.proc_handler	= proc_dointvec_minmax,
+ 		.extra1		= SYSCTL_ZERO,
+ 		.extra2		= SYSCTL_ONE,
+ 	},
+ #endif
+ 	{ }	/* empty entry terminating the table */
+ };
+ 
+ static struct ctl_table fs_table[] = {	/* filesystem-related tunables; presumably exposed under the "fs" sysctl directory (registration is outside this chunk) */
+ 	{
+ 		.procname	= "inode-nr",
+ 		.data		= &inodes_stat,
+ 		.maxlen		= 2*sizeof(long),	/* exposes only the first two longs of inodes_stat */
+ 		.mode		= 0444,
+ 		.proc_handler	= proc_nr_inodes,
+ 	},
+ 	{
+ 		.procname	= "inode-state",
+ 		.data		= &inodes_stat,
+ 		.maxlen		= 7*sizeof(long),	/* same backing object as "inode-nr", wider window (seven longs) */
+ 		.mode		= 0444,
+ 		.proc_handler	= proc_nr_inodes,
+ 	},
+ 	{
+ 		.procname	= "file-nr",
+ 		.data		= &files_stat,
+ 		.maxlen		= sizeof(files_stat),
+ 		.mode		= 0444,
+ 		.proc_handler	= proc_nr_files,
+ 	},
+ 	{
+ 		.procname	= "file-max",
+ 		.data		= &files_stat.max_files,
+ 		.maxlen		= sizeof(files_stat.max_files),
+ 		.mode		= 0644,
+ 		.proc_handler	= proc_doulongvec_minmax,
+ 		.extra1		= &zero_ul,
+ 		.extra2		= &long_max,
+ 	},
+ 	{
+ 		.procname	= "nr_open",
+ 		.data		= &sysctl_nr_open,
+ 		.maxlen		= sizeof(unsigned int),
+ 		.mode		= 0644,
+ 		.proc_handler	= proc_dointvec_minmax,
+ 		.extra1		= &sysctl_nr_open_min,
+ 		.extra2		= &sysctl_nr_open_max,
+ 	},
+ 	{
+ 		.procname	= "dentry-state",
+ 		.data		= &dentry_stat,
+ 		.maxlen		= 6*sizeof(long),
+ 		.mode		= 0444,
+ 		.proc_handler	= proc_nr_dentry,
+ 	},
+ 	{
+ 		.procname	= "overflowuid",
+ 		.data		= &fs_overflowuid,
+ 		.maxlen		= sizeof(int),
+ 		.mode		= 0644,
+ 		.proc_handler	= proc_dointvec_minmax,
+ 		.extra1		= &minolduid,
+ 		.extra2		= &maxolduid,
+ 	},
+ 	{
+ 		.procname	= "overflowgid",
+ 		.data		= &fs_overflowgid,
+ 		.maxlen		= sizeof(int),
+ 		.mode		= 0644,
+ 		.proc_handler	= proc_dointvec_minmax,
+ 		.extra1		= &minolduid,	/* the gid entry reuses the same min/max bounds as overflowuid */
+ 		.extra2		= &maxolduid,
+ 	},
+ #ifdef CONFIG_FILE_LOCKING
+ 	{
+ 		.procname	= "leases-enable",
+ 		.data		= &leases_enable,
+ 		.maxlen		= sizeof(int),
+ 		.mode		= 0644,
+ 		.proc_handler	= proc_dointvec,
+ 	},
+ #endif
+ #ifdef CONFIG_DNOTIFY
+ 	{
+ 		.procname	= "dir-notify-enable",
+ 		.data		= &dir_notify_enable,
+ 		.maxlen		= sizeof(int),
+ 		.mode		= 0644,
+ 		.proc_handler	= proc_dointvec,
+ 	},
+ #endif
+ #ifdef CONFIG_MMU	/* the lease-break-time, aio, inotify and epoll entries below are only built with an MMU */
+ #ifdef CONFIG_FILE_LOCKING
+ 	{
+ 		.procname	= "lease-break-time",
+ 		.data		= &lease_break_time,
+ 		.maxlen		= sizeof(int),
+ 		.mode		= 0644,
+ 		.proc_handler	= proc_dointvec,
+ 	},
+ #endif
+ #ifdef CONFIG_AIO
+ 	{
+ 		.procname	= "aio-nr",
+ 		.data		= &aio_nr,
+ 		.maxlen		= sizeof(aio_nr),
+ 		.mode		= 0444,
+ 		.proc_handler	= proc_doulongvec_minmax,
+ 	},
+ 	{
+ 		.procname	= "aio-max-nr",
+ 		.data		= &aio_max_nr,
+ 		.maxlen		= sizeof(aio_max_nr),
+ 		.mode		= 0644,
+ 		.proc_handler	= proc_doulongvec_minmax,
+ 	},
+ #endif /* CONFIG_AIO */
+ #ifdef CONFIG_INOTIFY_USER
+ 	{
+ 		.procname	= "inotify",
+ 		.mode		= 0555,
+ 		.child		= inotify_table,	/* directory entry: contents come from a separate table */
+ 	},
+ #endif	
+ #ifdef CONFIG_EPOLL
+ 	{
+ 		.procname	= "epoll",
+ 		.mode		= 0555,
+ 		.child		= epoll_table,	/* directory entry: contents come from a separate table */
+ 	},
+ #endif
+ #endif	/* CONFIG_MMU */
+ 	{
+ 		.procname	= "protected_symlinks",
+ 		.data		= &sysctl_protected_symlinks,
+ 		.maxlen		= sizeof(int),
+ 		.mode		= 0600,	/* owner read/write only, unlike the 0644 used by most entries here */
+ 		.proc_handler	= proc_dointvec_minmax,
+ 		.extra1		= SYSCTL_ZERO,
+ 		.extra2		= SYSCTL_ONE,
+ 	},
+ 	{
+ 		.procname	= "protected_hardlinks",
+ 		.data		= &sysctl_protected_hardlinks,
+ 		.maxlen		= sizeof(int),
+ 		.mode		= 0600,
+ 		.proc_handler	= proc_dointvec_minmax,
+ 		.extra1		= SYSCTL_ZERO,
+ 		.extra2		= SYSCTL_ONE,
+ 	},
+ 	{
+ 		.procname	= "protected_fifos",
+ 		.data		= &sysctl_protected_fifos,
+ 		.maxlen		= sizeof(int),
+ 		.mode		= 0600,
+ 		.proc_handler	= proc_dointvec_minmax,
+ 		.extra1		= SYSCTL_ZERO,
+ 		.extra2		= &two,
+ 	},
+ 	{
+ 		.procname	= "protected_regular",
+ 		.data		= &sysctl_protected_regular,
+ 		.maxlen		= sizeof(int),
+ 		.mode		= 0600,
+ 		.proc_handler	= proc_dointvec_minmax,
+ 		.extra1		= SYSCTL_ZERO,
+ 		.extra2		= &two,
+ 	},
+ 	{
+ 		.procname	= "suid_dumpable",
+ 		.data		= &suid_dumpable,
+ 		.maxlen		= sizeof(int),
+ 		.mode		= 0644,
+ 		.proc_handler	= proc_dointvec_minmax_coredump,
+ 		.extra1		= SYSCTL_ZERO,
+ 		.extra2		= &two,
+ 	},
+ #if defined(CONFIG_BINFMT_MISC) || defined(CONFIG_BINFMT_MISC_MODULE)
+ 	{
+ 		.procname	= "binfmt_misc",
+ 		.mode		= 0555,
+ 		.child		= sysctl_mount_point,	/* presumably an empty directory reserved as a mount point — confirm against sysctl_mount_point's definition */
+ 	},
+ #endif
+ 	{
+ 		.procname	= "pipe-max-size",
+ 		.data		= &pipe_max_size,
+ 		.maxlen		= sizeof(pipe_max_size),
+ 		.mode		= 0644,
+ 		.proc_handler	= proc_dopipe_max_size,
+ 	},
+ 	{
+ 		.procname	= "pipe-user-pages-hard",
+ 		.data		= &pipe_user_pages_hard,
+ 		.maxlen		= sizeof(pipe_user_pages_hard),
+ 		.mode		= 0644,
+ 		.proc_handler	= proc_doulongvec_minmax,
+ 	},
+ 	{
+ 		.procname	= "pipe-user-pages-soft",
+ 		.data		= &pipe_user_pages_soft,
+ 		.maxlen		= sizeof(pipe_user_pages_soft),
+ 		.mode		= 0644,
+ 		.proc_handler	= proc_doulongvec_minmax,
+ 	},
+ 	{
+ 		.procname	= "mount-max",
+ 		.data		= &sysctl_mount_max,
+ 		.maxlen		= sizeof(unsigned int),
+ 		.mode		= 0644,
+ 		.proc_handler	= proc_dointvec_minmax,
+ 		.extra1		= SYSCTL_ONE,	/* lower bound of 1; no upper bound is set */
+ 	},
+ 	{ }	/* empty entry terminating the table */
+ };
+ 
+ /*
+  * Debug-related sysctl entries. Each entry is compiled in only when its
+  * config option is enabled; the table closes with an empty entry.
+  */
+ static struct ctl_table debug_table[] = {
+ #ifdef CONFIG_SYSCTL_EXCEPTION_TRACE
+ 	{
+ 		.procname	= "exception-trace",
+ 		.data		= &show_unhandled_signals,
+ 		.maxlen		= sizeof(int),
+ 		.mode		= 0644,
+ 		.proc_handler	= proc_dointvec
+ 	},
+ #endif
+ #if defined(CONFIG_OPTPROBES)
+ 	{
+ 		.procname	= "kprobes-optimization",
+ 		.data		= &sysctl_kprobes_optimization,
+ 		.maxlen		= sizeof(int),
+ 		.mode		= 0644,
+ 		.proc_handler	= proc_kprobes_optimization_handler,
+ 		/* Only 0 and 1 are passed as the allowed range. */
+ 		.extra1		= SYSCTL_ZERO,
+ 		.extra2		= SYSCTL_ONE,
+ 	},
+ #endif
+ 	{ }
+ };
+ 
+ /* No entries are defined in this table here; it holds only the empty entry. */
+ static struct ctl_table dev_table[] = {
+ 	{ }
+ };
+ 
+ /*
+  * Register sysctl_base_table. The header returned by the registration is
+  * only handed to kmemleak_not_leak() and is not kept anywhere else.
+  *
+  * Always returns 0.
+  */
+ int __init sysctl_init(void)
+ {
+ 	kmemleak_not_leak(register_sysctl_table(sysctl_base_table));
+ 
+ 	return 0;
+ }
+ 
+ #endif /* CONFIG_SYSCTL */
+ 
+ /*
+  * /proc/sys support
+  */
+ 
+ #ifdef CONFIG_PROC_SYSCTL
+ 
+ /*
+  * Copy a string between the kernel buffer @data (capacity @maxlen bytes)
+  * and the user buffer @buffer of *lenp bytes.
+  *
+  * Write: bytes are taken from @buffer until a NUL or newline is seen,
+  * *lenp bytes have been consumed, or @data has room only for the
+  * terminator. The result is NUL-terminated and *ppos advances by *lenp.
+  * In SYSCTL_WRITES_STRICT mode the write continues at offset *ppos and is
+  * silently dropped when *ppos lies beyond the current string.
+  *
+  * Read: the string is copied out starting at offset *ppos, a newline is
+  * appended when the user buffer has room for it, and *lenp and *ppos are
+  * updated with the number of bytes produced.
+  *
+  * Returns 0 on success or -EFAULT when a user-space access fails.
+  */
+ static int _proc_do_string(char *data, int maxlen, int write,
+ 			   char __user *buffer,
+ 			   size_t *lenp, loff_t *ppos)
+ {
+ 	size_t len;
+ 	char __user *p;
+ 	char c;
+ 
+ 	/* Nothing to transfer: report zero bytes handled. */
+ 	if (!data || !maxlen || !*lenp) {
+ 		*lenp = 0;
+ 		return 0;
+ 	}
+ 
+ 	if (write) {
+ 		if (sysctl_writes_strict == SYSCTL_WRITES_STRICT) {
+ 			/* Only continue writes not past the end of buffer. */
+ 			len = strlen(data);
+ 			if (len > maxlen - 1)
+ 				len = maxlen - 1;
+ 
+ 			if (*ppos > len)
+ 				return 0;
+ 			/* Resume at the requested offset inside the string. */
+ 			len = *ppos;
+ 		} else {
+ 			/* Start writing from beginning of buffer. */
+ 			len = 0;
+ 		}
+ 
+ 		/* The whole user buffer counts as consumed, even if truncated. */
+ 		*ppos += *lenp;
+ 		p = buffer;
+ 		/* Leave one byte of @data free for the terminating NUL. */
+ 		while ((p - buffer) < *lenp && len < maxlen - 1) {
+ 			if (get_user(c, p++))
+ 				return -EFAULT;
+ 			if (c == 0 || c == '\n')
+ 				break;
+ 			data[len++] = c;
+ 		}
+ 		data[len] = 0;
+ 	} else {
+ 		len = strlen(data);
+ 		if (len > maxlen)
+ 			len = maxlen;
+ 
+ 		/* Reading past the end of the string yields no data. */
+ 		if (*ppos > len) {
+ 			*lenp = 0;
+ 			return 0;
+ 		}
+ 
+ 		data += *ppos;
+ 		len  -= *ppos;
+ 
+ 		if (len > *lenp)
+ 			len = *lenp;
+ 		if (len)
+ 			if (copy_to_user(buffer, data, len))
+ 				return -EFAULT;
+ 		/* Append a newline only when the user buffer still has room. */
+ 		if (len < *lenp) {
+ 			if (put_user('\n', buffer + len))
+ 				return -EFAULT;
+ 			len++;
+ 		}
+ 		*lenp = len;
+ 		*ppos += len;
+ 	}
+ 	return 0;
+ }
+ 
+ /*
+  * Emit a one-time warning naming the current task and the sysctl entry
+  * that was written at a non-zero file position.
+  */
+ static void warn_sysctl_write(struct ctl_table *table)
+ {
+ 	const char *knob = table->procname;
+ 
+ 	pr_warn_once("%s wrote to %s when file position was not 0!\n"
+ 		"This will not be supported in the future. To silence this\n"
+ 		"warning, set kernel.sysctl_writes_strict = -1\n",
+ 		current->comm, knob);
+ }
+ 
+ /**
+  * proc_first_pos_non_zero_ignore - decide whether an offset write is dropped
+  * @ppos: file position
+  * @table: the sysctl table
+  *
+  * A write at position zero is never ignored. For a non-zero position the
+  * result depends on sysctl_writes_strict: SYSCTL_WRITES_STRICT returns true,
+  * SYSCTL_WRITES_WARN emits a one-time warning and returns false, and every
+  * other mode returns false silently. Numeric handlers skip the write when
+  * true is returned; string handlers can ignore the return value.
+  */
+ static bool proc_first_pos_non_zero_ignore(loff_t *ppos,
+ 					   struct ctl_table *table)
+ {
+ 	int mode = sysctl_writes_strict;
+ 
+ 	if (*ppos == 0)
+ 		return false;
+ 
+ 	if (mode == SYSCTL_WRITES_STRICT)
+ 		return true;
+ 
+ 	if (mode == SYSCTL_WRITES_WARN)
+ 		warn_sysctl_write(table);
+ 
+ 	return false;
+ }
+ 
+ /**
+  * proc_dostring - read or write a string sysctl
+  * @table: the sysctl table
+  * @write: %TRUE if this is a write to the sysctl file
+  * @buffer: the user buffer
+  * @lenp: the size of the user buffer
+  * @ppos: file position
+  *
+  * Transfers a string between table->data and the user buffer. On a write
+  * the string is truncated when the kernel buffer (table->maxlen bytes) is
+  * too small, and the stored copy is always NUL-terminated. On a read the
+  * string is copied out followed by a newline '\n', truncated if the user
+  * buffer is not large enough.
+  *
+  * Returns 0 on success.
+  */
+ int proc_dostring(struct ctl_table *table, int write,
+ 		  void __user *buffer, size_t *lenp, loff_t *ppos)
+ {
+ 	char __user *ubuf = buffer;
+ 	char *kdata;
+ 
+ 	/* Called for its warning side effect only; the verdict is unused. */
+ 	if (write)
+ 		proc_first_pos_non_zero_ignore(ppos, table);
+ 
+ 	kdata = table->data;
+ 	return _proc_do_string(kdata, table->maxlen, write, ubuf, lenp, ppos);
+ }
+ 
+ /*
+  * Advance *buf past leading whitespace (as defined by skip_spaces()) and
+  * return how many bytes were skipped.
+  */
+ static size_t proc_skip_spaces(char **buf)
+ {
+ 	char *start = *buf;
+ 
+ 	*buf = skip_spaces(start);
+ 	return *buf - start;
+ }
+ 
+ /*
+  * Advance *buf over a run of the character @v, decrementing *size for each
+  * byte skipped; stops when *size reaches zero or a different byte is met.
+  */
+ static void proc_skip_char(char **buf, size_t *size, const char v)
+ {
+ 	for (; *size && **buf == v; (*buf)++)
+ 		(*size)--;
+ }
+ 
+ /**
+  * strtoul_lenient - parse an ASCII formatted integer, failing only on overflow
+  *
+  * @cp: kernel buffer containing the string to parse
+  * @endp: optional; receives a pointer to the first unparsed character
+  * @base: the base to use
+  * @res: where the parsed integer will be stored
+  *
+  * Returns 0 and stores the value in @res on success. Returns -ERANGE when
+  * the parser reports an overflow or the value does not fit in an unsigned
+  * long; @endp and @res are left untouched in that case. Which trailing
+  * characters count as invalid is left for the caller to decide.
+  */
+ static int strtoul_lenient(const char *cp, char **endp, unsigned int base,
+ 			   unsigned long *res)
+ {
+ 	unsigned long long parsed;
+ 	unsigned int consumed;
+ 	const char *digits;
+ 
+ 	digits = _parse_integer_fixup_radix(cp, &base);
+ 	consumed = _parse_integer(digits, base, &parsed);
+ 
+ 	if (consumed & KSTRTOX_OVERFLOW)
+ 		return -ERANGE;
+ 	if (parsed != (unsigned long)parsed)
+ 		return -ERANGE;
+ 
+ 	if (endp)
+ 		*endp = (char *)(digits + consumed);
+ 
+ 	*res = (unsigned long)parsed;
+ 	return 0;
+ }
+ 
+ #define TMPBUFLEN 22
+ /**
+  * proc_get_long - reads an ASCII formatted integer from a kernel buffer
+  *
+  * @buf: a kernel buffer
+  * @size: size of the kernel buffer
+  * @val: this is where the number will be stored
+  * @neg: set to %TRUE if number is negative
+  * @perm_tr: a vector which contains the allowed trailers
+  * @perm_tr_len: size of the perm_tr vector
+  * @tr: pointer to store the trailer character
+  *
+  * In case of success %0 is returned and @buf and @size are updated with
+  * the amount of bytes read. If @tr is non-NULL and a trailing
+  * character exists (size is non-zero after returning from this
+  * function), @tr is updated with the trailing character.
+  *
+  * Returns -EINVAL when @size is zero, when the text does not start with a
+  * digit (after an optional '-'), when the number overflows, when it fills
+  * the whole scratch buffer, or when it is followed by a character that is
+  * not listed in @perm_tr.
+  */
+ static int proc_get_long(char **buf, size_t *size,
+ 			  unsigned long *val, bool *neg,
+ 			  const char *perm_tr, unsigned perm_tr_len, char *tr)
+ {
+ 	/*
+ 	 * Keep the length in the same type as *size. Storing it in an int
+ 	 * would truncate large sizes, and a negative result would bypass
+ 	 * the clamp below and reach memcpy() as a huge length.
+ 	 */
+ 	size_t len = *size;
+ 	char *p, tmp[TMPBUFLEN];
+ 
+ 	if (!len)
+ 		return -EINVAL;
+ 
+ 	/* Parse from a bounded, NUL-terminated copy of the input. */
+ 	if (len > TMPBUFLEN - 1)
+ 		len = TMPBUFLEN - 1;
+ 
+ 	memcpy(tmp, *buf, len);
+ 
+ 	tmp[len] = 0;
+ 	p = tmp;
+ 	/* A lone '-' is not a sign; it is rejected by the digit check below. */
+ 	if (*p == '-' && *size > 1) {
+ 		*neg = true;
+ 		p++;
+ 	} else
+ 		*neg = false;
+ 	if (!isdigit(*p))
+ 		return -EINVAL;
+ 
+ 	if (strtoul_lenient(p, &p, 0, val))
+ 		return -EINVAL;
+ 
+ 	/* Bytes consumed, including the sign; p never moves before tmp. */
+ 	len = p - tmp;
+ 
+ 	/* We don't know if the next char is whitespace thus we may accept
+ 	 * invalid integers (e.g. 1234...a) or two integers instead of one
+ 	 * (e.g. 123...1). So lets not allow such large numbers. */
+ 	if (len == TMPBUFLEN - 1)
+ 		return -EINVAL;
+ 
+ 	if (len < *size && perm_tr_len && !memchr(perm_tr, *p, perm_tr_len))
+ 		return -EINVAL;
+ 
+ 	if (tr && (len < *size))
+ 		*tr = *p;
+ 
+ 	*buf += len;
+ 	*size -= len;
+ 
+ 	return 0;
+ }
+ 
+ /**
+  * proc_put_long - writes an integer as decimal ASCII text to a user buffer
+  *
+  * @buf: the user buffer
+  * @size: the size of the user buffer
+  * @val: magnitude of the integer to be converted
+  * @neg: sign of the number, %TRUE for negative
+  *
+  * The text is cut short when it does not fit into the remaining @size
+  * bytes. In case of success %0 is returned and @buf and @size are updated
+  * with the amount of bytes written; -EFAULT is returned when the copy to
+  * user space fails.
+  */
+ static int proc_put_long(void __user **buf, size_t *size, unsigned long val,
+ 			  bool neg)
+ {
+ 	char tmp[TMPBUFLEN];
+ 	size_t count;
+ 
+ 	sprintf(tmp, "%s%lu", neg ? "-" : "", val);
+ 
+ 	count = strlen(tmp);
+ 	if (count > *size)
+ 		count = *size;
+ 
+ 	if (copy_to_user(*buf, tmp, count))
+ 		return -EFAULT;
+ 
+ 	*buf += count;
+ 	*size -= count;
+ 	return 0;
+ }
+ #undef TMPBUFLEN
+ 
+ /*
+  * Store the single character @c at *buf in user space and advance *buf and
+  * *size by one. Does nothing when *size is zero.
+  *
+  * Returns 0 on success or -EFAULT when the user-space store fails.
+  */
+ static int proc_put_char(void __user **buf, size_t *size, char c)
+ {
+ 	char __user *dst;
+ 
+ 	if (!*size)
+ 		return 0;
+ 
+ 	dst = *buf;
+ 	if (put_user(c, dst))
+ 		return -EFAULT;
+ 
+ 	*buf = dst + 1;
+ 	(*size)--;
+ 	return 0;
+ }
+ 
+ /*
+  * Default int conversion callback.
+  *
+  * Write: combine the sign *negp and magnitude *lvalp into *valp, returning
+  * -EINVAL when the result would not fit in an int.
+  * Read: split *valp into sign (*negp) and magnitude (*lvalp).
+  *
+  * @data is unused. Returns 0 on success.
+  */
+ static int do_proc_dointvec_conv(bool *negp, unsigned long *lvalp,
+ 				 int *valp,
+ 				 int write, void *data)
+ {
+ 	if (!write) {
+ 		int cur = *valp;
+ 		bool negative = cur < 0;
+ 
+ 		*negp = negative;
+ 		*lvalp = negative ? -(unsigned long)cur : (unsigned long)cur;
+ 		return 0;
+ 	}
+ 
+ 	if (*negp) {
+ 		/* The most negative int has magnitude INT_MAX + 1. */
+ 		if (*lvalp > (unsigned long) INT_MAX + 1)
+ 			return -EINVAL;
+ 		*valp = -*lvalp;
+ 		return 0;
+ 	}
+ 
+ 	if (*lvalp > (unsigned long) INT_MAX)
+ 		return -EINVAL;
+ 	*valp = *lvalp;
+ 	return 0;
+ }
+ 
+ /*
+  * Default unsigned int conversion callback.
+  *
+  * Write: store *lvalp into *valp, or return -EINVAL when it exceeds
+  * UINT_MAX. Read: widen *valp into *lvalp. @data is unused.
+  */
+ static int do_proc_douintvec_conv(unsigned long *lvalp,
+ 				  unsigned int *valp,
+ 				  int write, void *data)
+ {
+ 	if (!write) {
+ 		unsigned int cur = *valp;
+ 
+ 		*lvalp = (unsigned long)cur;
+ 		return 0;
+ 	}
+ 
+ 	if (*lvalp > UINT_MAX)
+ 		return -EINVAL;
+ 
+ 	*valp = *lvalp;
+ 	return 0;
+ }
+ 
+ /*
+  * Characters accepted directly after a number by the vector handlers
+  * (passed to proc_get_long() as the permitted trailers). Not a C string:
+  * there is no terminating NUL, so always pair it with sizeof().
+  */
+ static const char proc_wspace_sep[] = { ' ', '\t', '\n' };
+ 
+ /*
+  * Core of the int-vector handlers: converts between the int array at
+  * @tbl_data (table->maxlen / sizeof(int) elements) and ASCII text in the
+  * user buffer.
+  *
+  * @conv translates between one int element and the (sign, magnitude) pair
+  * used for parsing and printing; do_proc_dointvec_conv() is used when it
+  * is NULL. @data is handed to @conv unchanged.
+  *
+  * Writes parse whitespace-separated numbers from at most PAGE_SIZE - 1
+  * bytes of input. Reads print the values separated by tabs, followed by a
+  * newline when there is room. A read at a non-zero *ppos returns no data.
+  *
+  * Returns 0 on success or a negative errno. A write that stored no value
+  * at all fails with -EINVAL unless a more specific error is already set.
+  */
+ static int __do_proc_dointvec(void *tbl_data, struct ctl_table *table,
+ 		  int write, void __user *buffer,
+ 		  size_t *lenp, loff_t *ppos,
+ 		  int (*conv)(bool *negp, unsigned long *lvalp, int *valp,
+ 			      int write, void *data),
+ 		  void *data)
+ {
+ 	int *i, vleft, first = 1, err = 0;
+ 	size_t left;
+ 	char *kbuf = NULL, *p;
+ 	
+ 	if (!tbl_data || !table->maxlen || !*lenp || (*ppos && !write)) {
+ 		*lenp = 0;
+ 		return 0;
+ 	}
+ 	
+ 	i = (int *) tbl_data;
+ 	vleft = table->maxlen / sizeof(*i);
+ 	left = *lenp;
+ 
+ 	if (!conv)
+ 		conv = do_proc_dointvec_conv;
+ 
+ 	if (write) {
+ 		/* Offset writes that must be ignored still consume the input. */
+ 		if (proc_first_pos_non_zero_ignore(ppos, table))
+ 			goto out;
+ 
+ 		if (left > PAGE_SIZE - 1)
+ 			left = PAGE_SIZE - 1;
+ 		/* Work on a NUL-terminated kernel copy of the user input. */
+ 		p = kbuf = memdup_user_nul(buffer, left);
+ 		if (IS_ERR(kbuf))
+ 			return PTR_ERR(kbuf);
+ 	}
+ 
+ 	/* One element per iteration; "first" is cleared after the first one. */
+ 	for (; left && vleft--; i++, first=0) {
+ 		unsigned long lval;
+ 		bool neg;
+ 
+ 		if (write) {
+ 			left -= proc_skip_spaces(&p);
+ 
+ 			if (!left)
+ 				break;
+ 			err = proc_get_long(&p, &left, &lval, &neg,
+ 					     proc_wspace_sep,
+ 					     sizeof(proc_wspace_sep), NULL);
+ 			if (err)
+ 				break;
+ 			if (conv(&neg, &lval, i, 1, data)) {
+ 				err = -EINVAL;
+ 				break;
+ 			}
+ 		} else {
+ 			if (conv(&neg, &lval, i, 0, data)) {
+ 				err = -EINVAL;
+ 				break;
+ 			}
+ 			/* Values after the first are separated by a tab. */
+ 			if (!first)
+ 				err = proc_put_char(&buffer, &left, '\t');
+ 			if (err)
+ 				break;
+ 			err = proc_put_long(&buffer, &left, lval, neg);
+ 			if (err)
+ 				break;
+ 		}
+ 	}
+ 
+ 	if (!write && !first && left && !err)
+ 		err = proc_put_char(&buffer, &left, '\n');
+ 	/* Trailing whitespace counts as consumed input. */
+ 	if (write && !err && left)
+ 		left -= proc_skip_spaces(&p);
+ 	if (write) {
+ 		kfree(kbuf);
+ 		/* Nothing was stored: report the error, or -EINVAL if none. */
+ 		if (first)
+ 			return err ? : -EINVAL;
+ 	}
+ 	/* *lenp becomes the number of bytes actually handled. */
+ 	*lenp -= left;
+ out:
+ 	*ppos += *lenp;
+ 	return err;
+ }
+ 
+ /*
+  * Run __do_proc_dointvec() on the table's own data pointer; all other
+  * arguments are passed through unchanged.
+  */
+ static int do_proc_dointvec(struct ctl_table *table, int write,
+ 		  void __user *buffer, size_t *lenp, loff_t *ppos,
+ 		  int (*conv)(bool *negp, unsigned long *lvalp, int *valp,
+ 			      int write, void *data),
+ 		  void *data)
+ {
+ 	void *values = table->data;
+ 
+ 	return __do_proc_dointvec(values, table, write, buffer, lenp, ppos,
+ 				  conv, data);
+ }
+ 
+ /*
+  * Write half of the unsigned int handler: parse one non-negative number
+  * from at most PAGE_SIZE - 1 bytes of user input and store it through
+  * @conv into @tbl_data.
+  *
+  * Returns 0 on success. Every failure after the initial offset check is
+  * reported as -EINVAL, whatever error code the failing step produced.
+  *
+  * NOTE(review): unlike __do_proc_dointvec(), the success path does not
+  * update *lenp or *ppos, and a memdup_user_nul() failure is reported as
+  * -EINVAL rather than PTR_ERR(kbuf) - confirm both are intended.
+  */
+ static int do_proc_douintvec_w(unsigned int *tbl_data,
+ 			       struct ctl_table *table,
+ 			       void __user *buffer,
+ 			       size_t *lenp, loff_t *ppos,
+ 			       int (*conv)(unsigned long *lvalp,
+ 					   unsigned int *valp,
+ 					   int write, void *data),
+ 			       void *data)
+ {
+ 	unsigned long lval;
+ 	int err = 0;
+ 	size_t left;
+ 	bool neg;
+ 	char *kbuf = NULL, *p;
+ 
+ 	left = *lenp;
+ 
+ 	/* An ignored offset write consumes the input and returns 0. */
+ 	if (proc_first_pos_non_zero_ignore(ppos, table))
+ 		goto bail_early;
+ 
+ 	if (left > PAGE_SIZE - 1)
+ 		left = PAGE_SIZE - 1;
+ 
+ 	/* Work on a NUL-terminated kernel copy of the user input. */
+ 	p = kbuf = memdup_user_nul(buffer, left);
+ 	if (IS_ERR(kbuf))
+ 		return -EINVAL;
+ 
+ 	left -= proc_skip_spaces(&p);
+ 	/* Input made of whitespace only is rejected. */
+ 	if (!left) {
+ 		err = -EINVAL;
+ 		goto out_free;
+ 	}
+ 
+ 	err = proc_get_long(&p, &left, &lval, &neg,
+ 			     proc_wspace_sep,
+ 			     sizeof(proc_wspace_sep), NULL);
+ 	/* Negative numbers are not valid for an unsigned value. */
+ 	if (err || neg) {
+ 		err = -EINVAL;
+ 		goto out_free;
+ 	}
+ 
+ 	/* Any conversion failure (including range errors) becomes -EINVAL. */
+ 	if (conv(&lval, tbl_data, 1, data)) {
+ 		err = -EINVAL;
+ 		goto out_free;
+ 	}
+ 
+ 	if (!err && left)
+ 		left -= proc_skip_spaces(&p);
+ 
+ out_free:
+ 	kfree(kbuf);
+ 	if (err)
+ 		return -EINVAL;
+ 
+ 	return 0;
+ 
+ 	/* This is in keeping with old __do_proc_dointvec() */
+ bail_early:
+ 	*ppos += *lenp;
+ 	return err;
+ }
+ 
+ /*
+  * Read half of the unsigned int handler: convert the value at @tbl_data
+  * through @conv and print it to the user buffer, followed by a newline
+  * when there is room. *lenp is set to the number of bytes produced and
+  * *ppos is advanced by the same amount.
+  *
+  * Returns 0 on success, -EINVAL when @conv fails, or the error from the
+  * user-space copy helpers.
+  */
+ static int do_proc_douintvec_r(unsigned int *tbl_data, void __user *buffer,
+ 			       size_t *lenp, loff_t *ppos,
+ 			       int (*conv)(unsigned long *lvalp,
+ 					   unsigned int *valp,
+ 					   int write, void *data),
+ 			       void *data)
+ {
+ 	size_t remaining = *lenp;
+ 	unsigned long lval;
+ 	int err;
+ 
+ 	if (conv(&lval, tbl_data, 0, data)) {
+ 		err = -EINVAL;
+ 	} else {
+ 		err = proc_put_long(&buffer, &remaining, lval, false);
+ 		if (!err && remaining)
+ 			err = proc_put_char(&buffer, &remaining, '\n');
+ 	}
+ 
+ 	*lenp -= remaining;
+ 	*ppos += *lenp;
+ 
+ 	return err;
+ }
+ 
+ /*
+  * Common entry point of the unsigned int handlers. Validates the request,
+  * selects do_proc_douintvec_conv() when no @conv is given and dispatches
+  * to the read or write helper.
+  *
+  * Returns 0 with *lenp = 0 when there is nothing to do (no data, empty
+  * buffer, or a read at a non-zero position) and -EINVAL with *lenp = 0
+  * when table->maxlen does not describe exactly one unsigned int.
+  */
+ static int __do_proc_douintvec(void *tbl_data, struct ctl_table *table,
+ 			       int write, void __user *buffer,
+ 			       size_t *lenp, loff_t *ppos,
+ 			       int (*conv)(unsigned long *lvalp,
+ 					   unsigned int *valp,
+ 					   int write, void *data),
+ 			       void *data)
+ {
+ 	unsigned int *valp = tbl_data;
+ 	unsigned int nr_vals;
+ 
+ 	if (!tbl_data || !table->maxlen || !*lenp || (*ppos && !write)) {
+ 		*lenp = 0;
+ 		return 0;
+ 	}
+ 
+ 	/*
+ 	 * Arrays are not supported, keep this simple. *Do not* add
+ 	 * support for them.
+ 	 */
+ 	nr_vals = table->maxlen / sizeof(*valp);
+ 	if (nr_vals != 1) {
+ 		*lenp = 0;
+ 		return -EINVAL;
+ 	}
+ 
+ 	if (!conv)
+ 		conv = do_proc_douintvec_conv;
+ 
+ 	return write ?
+ 		do_proc_douintvec_w(valp, table, buffer, lenp, ppos, conv, data) :
+ 		do_proc_douintvec_r(valp, buffer, lenp, ppos, conv, data);
+ }
+ 
+ /*
+  * Run __do_proc_douintvec() on the table's own data pointer; all other
+  * arguments are passed through unchanged.
+  */
+ static int do_proc_douintvec(struct ctl_table *table, int write,
+ 			     void __user *buffer, size_t *lenp, loff_t *ppos,
+ 			     int (*conv)(unsigned long *lvalp,
+ 					 unsigned int *valp,
+ 					 int write, void *data),
+ 			     void *data)
+ {
+ 	void *value = table->data;
+ 
+ 	return __do_proc_douintvec(value, table, write, buffer, lenp, ppos,
+ 				   conv, data);
+ }
+ 
+ /**
+  * proc_dointvec - read or write a vector of integers
+  * @table: the sysctl table
+  * @write: %TRUE if this is a write to the sysctl file
+  * @buffer: the user buffer
+  * @lenp: the size of the user buffer
+  * @ppos: file position
+  *
+  * Reads/writes up to table->maxlen/sizeof(int) integer values from/to the
+  * user buffer, treated as an ASCII string. The default conversion is
+  * used, so every value must fit in an int.
+  *
+  * Returns 0 on success.
+  */
+ int proc_dointvec(struct ctl_table *table, int write,
+ 		     void __user *buffer, size_t *lenp, loff_t *ppos)
+ {
+ 	return __do_proc_dointvec(table->data, table, write, buffer, lenp,
+ 				  ppos, NULL, NULL);
+ }
+ 
+ /**
+  * proc_douintvec - read or write an unsigned integer
+  * @table: the sysctl table
+  * @write: %TRUE if this is a write to the sysctl file
+  * @buffer: the user buffer
+  * @lenp: the size of the user buffer
+  * @ppos: file position
+  *
+  * Reads/writes a single unsigned int from/to the user buffer, treated as
+  * an ASCII string. Arrays are not supported: a table whose maxlen does not
+  * describe exactly one unsigned int is rejected with -EINVAL.
+  *
+  * Returns 0 on success.
+  */
+ int proc_douintvec(struct ctl_table *table, int write,
+ 		     void __user *buffer, size_t *lenp, loff_t *ppos)
+ {
+ 	return __do_proc_douintvec(table->data, table, write, buffer, lenp,
+ 				   ppos, do_proc_douintvec_conv, NULL);
+ }
+ 
+ /*
+  * Taint values can only be increased
+  * This means we can safely use a temporary.
+  *
+  * The handler runs proc_doulongvec_minmax() on a copy of the table whose
+  * data points at a local snapshot of get_taint(). On a write, every bit
+  * set in the resulting value is applied with add_taint(); bits are never
+  * cleared here. Writes require CAP_SYS_ADMIN and fail with -EPERM
+  * otherwise.
+  */
+ static int proc_taint(struct ctl_table *table, int write,
+ 			       void __user *buffer, size_t *lenp, loff_t *ppos)
+ {
+ 	struct ctl_table t;
+ 	unsigned long tmptaint = get_taint();
+ 	int err;
+ 
+ 	if (write && !capable(CAP_SYS_ADMIN))
+ 		return -EPERM;
+ 
+ 	/* Redirect the generic handler to the local snapshot. */
+ 	t = *table;
+ 	t.data = &tmptaint;
+ 	err = proc_doulongvec_minmax(&t, write, buffer, lenp, ppos);
+ 	if (err < 0)
+ 		return err;
+ 
+ 	if (write) {
+ 		/*
+ 		 * Poor man's atomic or. Not worth adding a primitive
+ 		 * to everyone's atomic.h for this
+ 		 */
+ 		int i;
+ 		/* Stop as soon as no set bits remain at or above bit i. */
+ 		for (i = 0; i < BITS_PER_LONG && tmptaint >> i; i++) {
+ 			if ((tmptaint >> i) & 1)
+ 				add_taint(i, LOCKDEP_STILL_OK);
+ 		}
+ 	}
+ 
+ 	return err;
+ }
+ 
+ /**
+  * proc_dointvec_minmax_sysadmin - proc_dointvec_minmax() with a
+  * CAP_SYS_ADMIN check on write
+  * @table: the sysctl table
+  * @write: %TRUE if this is a write to the sysctl file
+  * @buffer: the user buffer
+  * @lenp: the size of the user buffer
+  * @ppos: file position
+  *
+  * Reads/writes up to table->maxlen/sizeof(int) integer values from/to the
+  * user buffer, treated as an ASCII string, keeping written values within
+  * the range given by table->extra1 (min) and table->extra2 (max).
+  *
+  * Reads are always allowed; writes are only allowed when the caller has
+  * CAP_SYS_ADMIN.
+  *
+  * Returns 0 on success, -EPERM on permission failure or -EINVAL on write
+  * when the range check fails.
+  */
+ int proc_dointvec_minmax_sysadmin(struct ctl_table *table, int write,
+ 				void __user *buffer, size_t *lenp, loff_t *ppos)
+ {
+ 	if (!write || capable(CAP_SYS_ADMIN))
+ 		return proc_dointvec_minmax(table, write, buffer, lenp, ppos);
+ 
+ 	return -EPERM;
+ }
+ 
+ /**
+  * struct do_proc_dointvec_minmax_conv_param - proc_dointvec_minmax() range checking structure
+  * @min: pointer to minimum allowable value, or NULL for no lower bound
+  * @max: pointer to maximum allowable value, or NULL for no upper bound
+  *
+  * The do_proc_dointvec_minmax_conv_param structure provides the
+  * minimum and maximum values for doing range checking for those sysctl
+  * parameters that use the proc_dointvec_minmax() handler. Both bounds are
+  * inclusive.
+  */
+ struct do_proc_dointvec_minmax_conv_param {
+ 	int *min;
+ 	int *max;
+ };
+ 
+ /*
+  * Range-checking int conversion callback. @data points to a
+  * struct do_proc_dointvec_minmax_conv_param.
+  *
+  * Reads behave exactly like do_proc_dointvec_conv(). Writes are converted
+  * into a local int first, so that *valp is only modified once the value
+  * has passed the min/max check; out-of-range values yield -EINVAL.
+  */
+ static int do_proc_dointvec_minmax_conv(bool *negp, unsigned long *lvalp,
+ 					int *valp,
+ 					int write, void *data)
+ {
+ 	struct do_proc_dointvec_minmax_conv_param *range = data;
+ 	int candidate, rc;
+ 
+ 	if (!write)
+ 		return do_proc_dointvec_conv(negp, lvalp, valp, write, data);
+ 
+ 	rc = do_proc_dointvec_conv(negp, lvalp, &candidate, write, data);
+ 	if (rc)
+ 		return rc;
+ 
+ 	if (range->min && *range->min > candidate)
+ 		return -EINVAL;
+ 	if (range->max && *range->max < candidate)
+ 		return -EINVAL;
+ 
+ 	*valp = candidate;
+ 	return 0;
+ }
+ 
+ /**
+  * proc_dointvec_minmax - read or write a vector of integers with min/max values
+  * @table: the sysctl table
+  * @write: %TRUE if this is a write to the sysctl file
+  * @buffer: the user buffer
+  * @lenp: the size of the user buffer
+  * @ppos: file position
+  *
+  * Reads/writes up to table->maxlen/sizeof(int) integer values from/to the
+  * user buffer, treated as an ASCII string.
+  *
+  * Written values must lie within the inclusive range given by
+  * table->extra1 (min) and table->extra2 (max); a NULL pointer disables
+  * the corresponding bound.
+  *
+  * Returns 0 on success or -EINVAL on write when the range check fails.
+  */
+ int proc_dointvec_minmax(struct ctl_table *table, int write,
+ 		  void __user *buffer, size_t *lenp, loff_t *ppos)
+ {
+ 	struct do_proc_dointvec_minmax_conv_param limits;
+ 
+ 	limits.min = (int *) table->extra1;
+ 	limits.max = (int *) table->extra2;
+ 
+ 	return do_proc_dointvec(table, write, buffer, lenp, ppos,
+ 				do_proc_dointvec_minmax_conv, &limits);
+ }
+ 
+ /**
+  * struct do_proc_douintvec_minmax_conv_param - proc_douintvec_minmax() range checking structure
+  * @min: pointer to minimum allowable value, or NULL for no lower bound
+  * @max: pointer to maximum allowable value, or NULL for no upper bound
+  *
+  * The do_proc_douintvec_minmax_conv_param structure provides the
+  * minimum and maximum values for doing range checking for those sysctl
+  * parameters that use the proc_douintvec_minmax() handler. Both bounds are
+  * inclusive.
+  */
+ struct do_proc_douintvec_minmax_conv_param {
+ 	unsigned int *min;
+ 	unsigned int *max;
+ };
+ 
+ /*
+  * Range-checking unsigned int conversion callback. @data points to a
+  * struct do_proc_douintvec_minmax_conv_param.
+  *
+  * Reads behave exactly like do_proc_douintvec_conv(). Writes go through a
+  * local unsigned int so that *valp is only updated after the min/max
+  * check; out-of-range values yield -ERANGE.
+  */
+ static int do_proc_douintvec_minmax_conv(unsigned long *lvalp,
+ 					 unsigned int *valp,
+ 					 int write, void *data)
+ {
+ 	struct do_proc_douintvec_minmax_conv_param *range = data;
+ 	unsigned int candidate;
+ 	int rc;
+ 
+ 	if (!write)
+ 		return do_proc_douintvec_conv(lvalp, valp, write, data);
+ 
+ 	rc = do_proc_douintvec_conv(lvalp, &candidate, write, data);
+ 	if (rc)
+ 		return rc;
+ 
+ 	if (range->min && *range->min > candidate)
+ 		return -ERANGE;
+ 	if (range->max && *range->max < candidate)
+ 		return -ERANGE;
+ 
+ 	*valp = candidate;
+ 	return 0;
+ }
+ 
+ /**
+  * proc_douintvec_minmax - read or write an unsigned int with min/max values
+  * @table: the sysctl table
+  * @write: %TRUE if this is a write to the sysctl file
+  * @buffer: the user buffer
+  * @lenp: the size of the user buffer
+  * @ppos: file position
+  *
+  * Reads/writes a single unsigned int from/to the user buffer, treated as
+  * an ASCII string. Arrays are not supported and negative strings are not
+  * allowed.
+  *
+  * Written values must lie within the inclusive range given by
+  * table->extra1 (min) and table->extra2 (max), and must not exceed
+  * UINT_MAX, so wrap-around input from userspace is never accepted.
+  *
+  * Returns 0 on success. The conversion callback reports a failed range
+  * check as -ERANGE, but the write helper do_proc_douintvec_w() turns every
+  * conversion failure into -EINVAL, which is what the caller sees.
+  */
+ int proc_douintvec_minmax(struct ctl_table *table, int write,
+ 			  void __user *buffer, size_t *lenp, loff_t *ppos)
+ {
+ 	struct do_proc_douintvec_minmax_conv_param limits;
+ 
+ 	limits.min = (unsigned int *) table->extra1;
+ 	limits.max = (unsigned int *) table->extra2;
+ 
+ 	return do_proc_douintvec(table, write, buffer, lenp, ppos,
+ 				 do_proc_douintvec_minmax_conv, &limits);
+ }
+ 
+ /*
+  * Conversion callback for the pipe size limit.
+  *
+  * Write: pass the requested value through round_pipe_size() and store the
+  * result; a result of 0 is treated as invalid (-EINVAL).
+  * Read: widen *valp into *lvalp. @data is unused.
+  */
+ static int do_proc_dopipe_max_size_conv(unsigned long *lvalp,
+ 					unsigned int *valp,
+ 					int write, void *data)
+ {
+ 	unsigned int size;
+ 
+ 	if (!write) {
+ 		size = *valp;
+ 		*lvalp = (unsigned long) size;
+ 		return 0;
+ 	}
+ 
+ 	size = round_pipe_size(*lvalp);
+ 	if (!size)
+ 		return -EINVAL;
+ 
+ 	*valp = size;
+ 	return 0;
+ }
+ 
+ /*
+  * Sysctl handler for an unsigned int whose written values are converted
+  * by do_proc_dopipe_max_size_conv().
+  */
+ static int proc_dopipe_max_size(struct ctl_table *table, int write,
+ 				void __user *buffer, size_t *lenp, loff_t *ppos)
+ {
+ 	return __do_proc_douintvec(table->data, table, write, buffer, lenp,
+ 				   ppos, do_proc_dopipe_max_size_conv, NULL);
+ }
+ 
+ /*
+  * Warn when suid_dumpable is SUID_DUMP_ROOT while core_pattern starts with
+  * neither '/' nor '|'. Compiles to an empty function without
+  * CONFIG_COREDUMP.
+  */
+ static void validate_coredump_safety(void)
+ {
+ #ifdef CONFIG_COREDUMP
+ 	if (suid_dumpable != SUID_DUMP_ROOT)
+ 		return;
+ 
+ 	/* An absolute path or a pipe handler is considered safe. */
+ 	if (core_pattern[0] == '/' || core_pattern[0] == '|')
+ 		return;
+ 
+ 	printk(KERN_WARNING
+ "Unsafe core_pattern used with fs.suid_dumpable=2.\n"
+ "Pipe handler or fully qualified core dump path required.\n"
+ "Set kernel.core_pattern before fs.suid_dumpable.\n"
+ 	);
+ #endif
+ }
+ 
+ /*
+  * proc_dointvec_minmax() followed, on success, by a re-check of the
+  * coredump settings.
+  */
+ static int proc_dointvec_minmax_coredump(struct ctl_table *table, int write,
+ 		void __user *buffer, size_t *lenp, loff_t *ppos)
+ {
+ 	int error;
+ 
+ 	error = proc_dointvec_minmax(table, write, buffer, lenp, ppos);
+ 	if (error)
+ 		return error;
+ 
+ 	validate_coredump_safety();
+ 	return 0;
+ }
+ 
+ #ifdef CONFIG_COREDUMP
+ /*
+  * proc_dostring() followed, on success, by a re-check of the coredump
+  * settings.
+  */
+ static int proc_dostring_coredump(struct ctl_table *table, int write,
+ 		  void __user *buffer, size_t *lenp, loff_t *ppos)
+ {
+ 	int error;
+ 
+ 	error = proc_dostring(table, write, buffer, lenp, ppos);
+ 	if (error)
+ 		return error;
+ 
+ 	validate_coredump_safety();
+ 	return 0;
+ }
+ #endif
+ 
+ /*
+  * Core of the unsigned long vector handlers: converts between the
+  * unsigned long array at @data (table->maxlen / sizeof(unsigned long)
+  * elements) and ASCII text in the user buffer.
+  *
+  * Written values are scaled by @convmul / @convdiv before being checked
+  * against table->extra1 (min) and table->extra2 (max) and stored; values
+  * read back are scaled by @convdiv / @convmul. A NULL bound disables that
+  * check. Negative numbers in the input are skipped without storing
+  * anything into their slot.
+  *
+  * Returns 0 on success or a negative errno. A write that processed no
+  * value at all fails with -EINVAL unless a more specific error is set.
+  *
+  * NOTE(review): the scaling multiplications are not checked for overflow.
+  */
+ static int __do_proc_doulongvec_minmax(void *data, struct ctl_table *table, int write,
+ 				     void __user *buffer,
+ 				     size_t *lenp, loff_t *ppos,
+ 				     unsigned long convmul,
+ 				     unsigned long convdiv)
+ {
+ 	unsigned long *i, *min, *max;
+ 	int vleft, first = 1, err = 0;
+ 	size_t left;
+ 	char *kbuf = NULL, *p;
+ 
+ 	if (!data || !table->maxlen || !*lenp || (*ppos && !write)) {
+ 		*lenp = 0;
+ 		return 0;
+ 	}
+ 
+ 	i = (unsigned long *) data;
+ 	min = (unsigned long *) table->extra1;
+ 	max = (unsigned long *) table->extra2;
+ 	vleft = table->maxlen / sizeof(unsigned long);
+ 	left = *lenp;
+ 
+ 	if (write) {
+ 		/* Offset writes that must be ignored still consume the input. */
+ 		if (proc_first_pos_non_zero_ignore(ppos, table))
+ 			goto out;
+ 
+ 		if (left > PAGE_SIZE - 1)
+ 			left = PAGE_SIZE - 1;
+ 		/* Work on a NUL-terminated kernel copy of the user input. */
+ 		p = kbuf = memdup_user_nul(buffer, left);
+ 		if (IS_ERR(kbuf))
+ 			return PTR_ERR(kbuf);
+ 	}
+ 
+ 	/* One element per iteration; "first" is cleared after the first one. */
+ 	for (; left && vleft--; i++, first = 0) {
+ 		unsigned long val;
+ 
+ 		if (write) {
+ 			bool neg;
+ 
+ 			left -= proc_skip_spaces(&p);
+ 			if (!left)
+ 				break;
+ 
+ 			err = proc_get_long(&p, &left, &val, &neg,
+ 					     proc_wspace_sep,
+ 					     sizeof(proc_wspace_sep), NULL);
+ 			if (err)
+ 				break;
+ 			/* Negative input leaves this slot untouched. */
+ 			if (neg)
+ 				continue;
+ 			val = convmul * val / convdiv;
+ 			if ((min && val < *min) || (max && val > *max)) {
+ 				err = -EINVAL;
+ 				break;
+ 			}
+ 			*i = val;
+ 		} else {
+ 			val = convdiv * (*i) / convmul;
+ 			/* Values after the first are separated by a tab. */
+ 			if (!first) {
+ 				err = proc_put_char(&buffer, &left, '\t');
+ 				if (err)
+ 					break;
+ 			}
+ 			err = proc_put_long(&buffer, &left, val, false);
+ 			if (err)
+ 				break;
+ 		}
+ 	}
+ 
+ 	if (!write && !first && left && !err)
+ 		err = proc_put_char(&buffer, &left, '\n');
+ 	/* Trailing whitespace counts as consumed input. */
+ 	if (write && !err)
+ 		left -= proc_skip_spaces(&p);
+ 	if (write) {
+ 		kfree(kbuf);
+ 		/* Nothing was processed: report the error, or -EINVAL if none. */
+ 		if (first)
+ 			return err ? : -EINVAL;
+ 	}
+ 	/* *lenp becomes the number of bytes actually handled. */
+ 	*lenp -= left;
+ out:
+ 	*ppos += *lenp;
+ 	return err;
+ }
+ 
+ /*
+  * Run __do_proc_doulongvec_minmax() on the table's own data pointer; all
+  * other arguments are passed through unchanged.
+  */
+ static int do_proc_doulongvec_minmax(struct ctl_table *table, int write,
+ 				     void __user *buffer,
+ 				     size_t *lenp, loff_t *ppos,
+ 				     unsigned long convmul,
+ 				     unsigned long convdiv)
+ {
+ 	void *values = table->data;
+ 
+ 	return __do_proc_doulongvec_minmax(values, table, write, buffer,
+ 					   lenp, ppos, convmul, convdiv);
+ }
+ 
+ /**
+  * proc_doulongvec_minmax - read or write a vector of unsigned longs with
+  * min/max values
+  * @table: the sysctl table
+  * @write: %TRUE if this is a write to the sysctl file
+  * @buffer: the user buffer
+  * @lenp: the size of the user buffer
+  * @ppos: file position
+  *
+  * Reads/writes up to table->maxlen/sizeof(unsigned long) unsigned long
+  * values from/to the user buffer, treated as an ASCII string. Values are
+  * stored unscaled.
+  *
+  * Written values must lie within the range given by table->extra1 (min)
+  * and table->extra2 (max).
+  *
+  * Returns 0 on success.
+  */
+ int proc_doulongvec_minmax(struct ctl_table *table, int write,
+ 			   void __user *buffer, size_t *lenp, loff_t *ppos)
+ {
+ 	const unsigned long no_scale = 1l;
+ 
+ 	return do_proc_doulongvec_minmax(table, write, buffer, lenp, ppos,
+ 					 no_scale, no_scale);
+ }
+ 
+ /**
+  * proc_doulongvec_ms_jiffies_minmax - read or write a vector of millisecond
+  * values with min/max values
+  * @table: the sysctl table
+  * @write: %TRUE if this is a write to the sysctl file
+  * @buffer: the user buffer
+  * @lenp: the size of the user buffer
+  * @ppos: file position
+  *
+  * Reads/writes up to table->maxlen/sizeof(unsigned long) unsigned long
+  * values from/to the user buffer, treated as an ASCII string. Userspace
+  * sees milliseconds; written values are multiplied by HZ and divided by
+  * 1000 before being stored, and the inverse is applied on reads.
+  *
+  * The scaled values must lie within the range given by table->extra1 (min)
+  * and table->extra2 (max).
+  *
+  * Returns 0 on success.
+  */
+ int proc_doulongvec_ms_jiffies_minmax(struct ctl_table *table, int write,
+ 				      void __user *buffer,
+ 				      size_t *lenp, loff_t *ppos)
+ {
+ 	const unsigned long mul = HZ;
+ 	const unsigned long div = 1000l;
+ 
+ 	return do_proc_doulongvec_minmax(table, write, buffer, lenp, ppos,
+ 					 mul, div);
+ }
+ 
+ 
+ /*
+  * Conversion callback between seconds (user side) and jiffies (stored).
+  *
+  * Write: multiply the magnitude by HZ and apply the sign; returns 1 when
+  * the product would exceed INT_MAX.
+  * Read: split *valp into sign and magnitude and divide the latter by HZ.
+  * @data is unused.
+  */
+ static int do_proc_dointvec_jiffies_conv(bool *negp, unsigned long *lvalp,
+ 					 int *valp,
+ 					 int write, void *data)
+ {
+ 	unsigned long magnitude;
+ 
+ 	if (!write) {
+ 		int cur = *valp;
+ 
+ 		*negp = cur < 0;
+ 		magnitude = *negp ? -(unsigned long)cur : (unsigned long)cur;
+ 		*lvalp = magnitude / HZ;
+ 		return 0;
+ 	}
+ 
+ 	if (*lvalp > INT_MAX / HZ)
+ 		return 1;
+ 
+ 	magnitude = *lvalp * HZ;
+ 	*valp = *negp ? -magnitude : magnitude;
+ 	return 0;
+ }
+ 
+ /*
+  * Conversion callback between USER_HZ ticks (user side) and jiffies
+  * (stored).
+  *
+  * Write: convert the signed input with clock_t_to_jiffies(); returns 1
+  * when USER_HZ < HZ and the magnitude exceeds (LONG_MAX / HZ) * USER_HZ.
+  * Read: split *valp into sign and magnitude and convert the latter with
+  * jiffies_to_clock_t(). @data is unused.
+  */
+ static int do_proc_dointvec_userhz_jiffies_conv(bool *negp, unsigned long *lvalp,
+ 						int *valp,
+ 						int write, void *data)
+ {
+ 	unsigned long magnitude;
+ 
+ 	if (!write) {
+ 		int cur = *valp;
+ 
+ 		*negp = cur < 0;
+ 		magnitude = *negp ? -(unsigned long)cur : (unsigned long)cur;
+ 		*lvalp = jiffies_to_clock_t(magnitude);
+ 		return 0;
+ 	}
+ 
+ 	if (USER_HZ < HZ && *lvalp > (LONG_MAX / HZ) * USER_HZ)
+ 		return 1;
+ 
+ 	*valp = clock_t_to_jiffies(*negp ? -*lvalp : *lvalp);
+ 	return 0;
+ }
+ 
+ /*
+  * Conversion callback between milliseconds (user side) and jiffies
+  * (stored).
+  *
+  * Write: convert the signed input with msecs_to_jiffies(); returns 1 when
+  * the result exceeds INT_MAX.
+  * Read: split *valp into sign and magnitude and convert the latter with
+  * jiffies_to_msecs(). @data is unused.
+  */
+ static int do_proc_dointvec_ms_jiffies_conv(bool *negp, unsigned long *lvalp,
+ 					    int *valp,
+ 					    int write, void *data)
+ {
+ 	unsigned long jif;
+ 
+ 	if (!write) {
+ 		int cur = *valp;
+ 		unsigned long magnitude;
+ 
+ 		*negp = cur < 0;
+ 		magnitude = *negp ? -(unsigned long)cur : (unsigned long)cur;
+ 		*lvalp = jiffies_to_msecs(magnitude);
+ 		return 0;
+ 	}
+ 
+ 	jif = msecs_to_jiffies(*negp ? -*lvalp : *lvalp);
+ 	if (jif > INT_MAX)
+ 		return 1;
+ 
+ 	*valp = (int)jif;
+ 	return 0;
+ }
+ 
+ /**
+  * proc_dointvec_jiffies - read or write a vector of integers as seconds
+  * @table: the sysctl table
+  * @write: %TRUE if this is a write to the sysctl file
+  * @buffer: the user buffer
+  * @lenp: the size of the user buffer
+  * @ppos: file position
+  *
+  * Reads/writes up to table->maxlen/sizeof(int) integer values from/to the
+  * user buffer, treated as an ASCII string. Userspace sees seconds; the
+  * values are stored as jiffies.
+  *
+  * Returns 0 on success.
+  */
+ int proc_dointvec_jiffies(struct ctl_table *table, int write,
+ 			  void __user *buffer, size_t *lenp, loff_t *ppos)
+ {
+ 	return do_proc_dointvec(table, write, buffer, lenp, ppos,
+ 				do_proc_dointvec_jiffies_conv, NULL);
+ }
+ 
+ /**
+  * proc_dointvec_userhz_jiffies - read or write a vector of integers as
+  * 1/USER_HZ seconds
+  * @table: the sysctl table
+  * @write: %TRUE if this is a write to the sysctl file
+  * @buffer: the user buffer
+  * @lenp: the size of the user buffer
+  * @ppos: pointer to the file position
+  *
+  * Reads/writes up to table->maxlen/sizeof(int) integer values from/to the
+  * user buffer, treated as an ASCII string. Userspace sees units of
+  * 1/USER_HZ seconds; the values are stored as jiffies.
+  *
+  * Returns 0 on success.
+  */
+ int proc_dointvec_userhz_jiffies(struct ctl_table *table, int write,
+ 				 void __user *buffer, size_t *lenp, loff_t *ppos)
+ {
+ 	return do_proc_dointvec(table, write, buffer, lenp, ppos,
+ 				do_proc_dointvec_userhz_jiffies_conv, NULL);
+ }
+ 
+ /**
+  * proc_dointvec_ms_jiffies - read a vector of integers as milliseconds
+  * @table: the sysctl table
+  * @write: %TRUE if this is a write to the sysctl file
+  * @buffer: the user buffer
+  * @lenp: the size of the user buffer
+  * @ppos: file position
+  * @ppos: the current position in the file
+  *
+  * Reads/writes up to table->maxlen/sizeof(unsigned int) integer
+  * values from/to the user buffer, treated as an ASCII string. 
+  * The values read are assumed to be in 1/1000 seconds, and 
+  * are converted into jiffies.
+  *
+  * Returns 0 on success.
+  */
+ int proc_dointvec_ms_jiffies(struct ctl_table *table, int write,
+ 			     void __user *buffer, size_t *lenp, loff_t *ppos)
+ {
+ 	return do_proc_dointvec(table, write, buffer, lenp, ppos,
+ 				do_proc_dointvec_ms_jiffies_conv, NULL);
+ }
+ 
+ static int proc_do_cad_pid(struct ctl_table *table, int write,
+ 			   void __user *buffer, size_t *lenp, loff_t *ppos)
+ {
+ 	struct pid *new_pid;
+ 	pid_t tmp;
+ 	int r;
+ 
+ 	tmp = pid_vnr(cad_pid);
+ 
+ 	r = __do_proc_dointvec(&tmp, table, write, buffer,
+ 			       lenp, ppos, NULL, NULL);
+ 	if (r || !write)
+ 		return r;
+ 
+ 	new_pid = find_get_pid(tmp);
+ 	if (!new_pid)
+ 		return -ESRCH;
+ 
+ 	put_pid(xchg(&cad_pid, new_pid));
+ 	return 0;
+ }
+ 
+ /**
+  * proc_do_large_bitmap - read/write from/to a large bitmap
+  * @table: the sysctl table
+  * @write: %TRUE if this is a write to the sysctl file
+  * @buffer: the user buffer
+  * @lenp: the size of the user buffer
+  * @ppos: file position
+  *
+  * The bitmap is stored at table->data and the bitmap length (in bits)
+  * in table->maxlen.
+  *
+  * We use a range comma separated format (e.g. 1,3-4,10-10) so that
+  * large bitmaps may be represented in a compact manner. Writing into
+  * the file will clear the bitmap then update it with the given input.
+  *
+  * Returns 0 on success.
+  */
+ int proc_do_large_bitmap(struct ctl_table *table, int write,
+ 			 void __user *buffer, size_t *lenp, loff_t *ppos)
+ {
+ 	int err = 0;
+ 	bool first = 1;
+ 	size_t left = *lenp;
+ 	unsigned long bitmap_len = table->maxlen;
+ 	unsigned long *bitmap = *(unsigned long **) table->data;
+ 	unsigned long *tmp_bitmap = NULL;
+ 	char tr_a[] = { '-', ',', '\n' }, tr_b[] = { ',', '\n', 0 }, c;
+ 
+ 	if (!bitmap || !bitmap_len || !left || (*ppos && !write)) {
+ 		*lenp = 0;
+ 		return 0;
+ 	}
+ 
+ 	if (write) {
+ 		char *kbuf, *p;
+ 		size_t skipped = 0;
+ 
+ 		if (left > PAGE_SIZE - 1) {
+ 			left = PAGE_SIZE - 1;
+ 			/* How much of the buffer we'll skip this pass */
+ 			skipped = *lenp - left;
+ 		}
+ 
+ 		p = kbuf = memdup_user_nul(buffer, left);
+ 		if (IS_ERR(kbuf))
+ 			return PTR_ERR(kbuf);
+ 
+ 		tmp_bitmap = bitmap_zalloc(bitmap_len, GFP_KERNEL);
+ 		if (!tmp_bitmap) {
+ 			kfree(kbuf);
+ 			return -ENOMEM;
+ 		}
+ 		proc_skip_char(&p, &left, '\n');
+ 		while (!err && left) {
+ 			unsigned long val_a, val_b;
+ 			bool neg;
+ 			size_t saved_left;
+ 
+ 			/* In case we stop parsing mid-number, we can reset */
+ 			saved_left = left;
+ 			err = proc_get_long(&p, &left, &val_a, &neg, tr_a,
+ 					     sizeof(tr_a), &c);
+ 			/*
+ 			 * If we consumed the entirety of a truncated buffer or
+ 			 * only one char is left (may be a "-"), then stop here,
+ 			 * reset, & come back for more.
+ 			 */
+ 			if ((left <= 1) && skipped) {
+ 				left = saved_left;
+ 				break;
+ 			}
+ 
+ 			if (err)
+ 				break;
+ 			if (val_a >= bitmap_len || neg) {
+ 				err = -EINVAL;
+ 				break;
+ 			}
+ 
+ 			val_b = val_a;
+ 			if (left) {
+ 				p++;
+ 				left--;
+ 			}
+ 
+ 			if (c == '-') {
+ 				err = proc_get_long(&p, &left, &val_b,
+ 						     &neg, tr_b, sizeof(tr_b),
+ 						     &c);
+ 				/*
+ 				 * If we consumed all of a truncated buffer,
+ 				 * then stop here, reset, & come back for more.
+ 				 */
+ 				if (!left && skipped) {
+ 					left = saved_left;
+ 					break;
+ 				}
+ 
+ 				if (err)
+ 					break;
+ 				if (val_b >= bitmap_len || neg ||
+ 				    val_a > val_b) {
+ 					err = -EINVAL;
+ 					break;
+ 				}
+ 				if (left) {
+ 					p++;
+ 					left--;
+ 				}
+ 			}
+ 
+ 			bitmap_set(tmp_bitmap, val_a, val_b - val_a + 1);
+ 			first = 0;
+ 			proc_skip_char(&p, &left, '\n');
+ 		}
+ 		kfree(kbuf);
+ 		left += skipped;
+ 	} else {
+ 		unsigned long bit_a, bit_b = 0;
+ 
+ 		while (left) {
+ 			bit_a = find_next_bit(bitmap, bitmap_len, bit_b);
+ 			if (bit_a >= bitmap_len)
+ 				break;
+ 			bit_b = find_next_zero_bit(bitmap, bitmap_len,
+ 						   bit_a + 1) - 1;
+ 
+ 			if (!first) {
+ 				err = proc_put_char(&buffer, &left, ',');
+ 				if (err)
+ 					break;
+ 			}
+ 			err = proc_put_long(&buffer, &left, bit_a, false);
+ 			if (err)
+ 				break;
+ 			if (bit_a != bit_b) {
+ 				err = proc_put_char(&buffer, &left, '-');
+ 				if (err)
+ 					break;
+ 				err = proc_put_long(&buffer, &left, bit_b, false);
+ 				if (err)
+ 					break;
+ 			}
+ 
+ 			first = 0; bit_b++;
+ 		}
+ 		if (!err)
+ 			err = proc_put_char(&buffer, &left, '\n');
+ 	}
+ 
+ 	if (!err) {
+ 		if (write) {
+ 			if (*ppos)
+ 				bitmap_or(bitmap, bitmap, tmp_bitmap, bitmap_len);
+ 			else
+ 				bitmap_copy(bitmap, tmp_bitmap, bitmap_len);
+ 		}
+ 		*lenp -= left;
+ 		*ppos += *lenp;
+ 	}
+ 
+ 	bitmap_free(tmp_bitmap);
+ 	return err;
+ }
+ 
+ #else /* CONFIG_PROC_SYSCTL */
+ 
+ int proc_dostring(struct ctl_table *table, int write,
+ 		  void __user *buffer, size_t *lenp, loff_t *ppos)
+ {
+ 	return -ENOSYS;
+ }
+ 
+ int proc_dointvec(struct ctl_table *table, int write,
+ 		  void __user *buffer, size_t *lenp, loff_t *ppos)
+ {
+ 	return -ENOSYS;
+ }
+ 
+ int proc_douintvec(struct ctl_table *table, int write,
+ 		  void __user *buffer, size_t *lenp, loff_t *ppos)
+ {
+ 	return -ENOSYS;
+ }
+ 
+ int proc_dointvec_minmax(struct ctl_table *table, int write,
+ 		    void __user *buffer, size_t *lenp, loff_t *ppos)
+ {
+ 	return -ENOSYS;
+ }
+ 
+ int proc_douintvec_minmax(struct ctl_table *table, int write,
+ 			  void __user *buffer, size_t *lenp, loff_t *ppos)
+ {
+ 	return -ENOSYS;
+ }
+ 
+ int proc_dointvec_minmax_sysadmin(struct ctl_table *table, int write,
+ 				  void *buffer, size_t *lenp, loff_t *ppos)
+ {
+        return -ENOSYS;
+ }
+ 
+ int proc_dointvec_jiffies(struct ctl_table *table, int write,
+ 		    void __user *buffer, size_t *lenp, loff_t *ppos)
+ {
+ 	return -ENOSYS;
+ }
+ 
+ int proc_dointvec_userhz_jiffies(struct ctl_table *table, int write,
+ 		    void __user *buffer, size_t *lenp, loff_t *ppos)
+ {
+ 	return -ENOSYS;
+ }
+ 
+ int proc_dointvec_ms_jiffies(struct ctl_table *table, int write,
+ 			     void __user *buffer, size_t *lenp, loff_t *ppos)
+ {
+ 	return -ENOSYS;
+ }
+ 
+ int proc_doulongvec_minmax(struct ctl_table *table, int write,
+ 		    void __user *buffer, size_t *lenp, loff_t *ppos)
+ {
+ 	return -ENOSYS;
+ }
+ 
+ int proc_doulongvec_ms_jiffies_minmax(struct ctl_table *table, int write,
+ 				      void __user *buffer,
+ 				      size_t *lenp, loff_t *ppos)
+ {
+     return -ENOSYS;
+ }
+ 
+ int proc_do_large_bitmap(struct ctl_table *table, int write,
+ 			 void __user *buffer, size_t *lenp, loff_t *ppos)
+ {
+ 	return -ENOSYS;
+ }
+ 
+ #endif /* CONFIG_PROC_SYSCTL */
+ 
+ #if defined(CONFIG_SYSCTL)
+ int proc_do_static_key(struct ctl_table *table, int write,
+ 		       void __user *buffer, size_t *lenp,
+ 		       loff_t *ppos)
+ {
+ 	struct static_key *key = (struct static_key *)table->data;
+ 	static DEFINE_MUTEX(static_key_mutex);
+ 	int val, ret;
+ 	struct ctl_table tmp = {
+ 		.data   = &val,
+ 		.maxlen = sizeof(val),
+ 		.mode   = table->mode,
+ 		.extra1 = SYSCTL_ZERO,
+ 		.extra2 = SYSCTL_ONE,
+ 	};
+ 
+ 	if (write && !capable(CAP_SYS_ADMIN))
+ 		return -EPERM;
+ 
+ 	mutex_lock(&static_key_mutex);
+ 	val = static_key_enabled(key);
+ 	ret = proc_dointvec_minmax(&tmp, write, buffer, lenp, ppos);
+ 	if (write && !ret) {
+ 		if (val)
+ 			static_key_enable(key);
+ 		else
+ 			static_key_disable(key);
+ 	}
+ 	mutex_unlock(&static_key_mutex);
+ 	return ret;
+ }
+ #endif
+ /*
+  * No sense putting this after each symbol definition, twice,
+  * exception granted :-)
+  */
+ EXPORT_SYMBOL(proc_dointvec);
+ EXPORT_SYMBOL(proc_douintvec);
+ EXPORT_SYMBOL(proc_dointvec_jiffies);
+ EXPORT_SYMBOL(proc_dointvec_minmax);
+ EXPORT_SYMBOL_GPL(proc_douintvec_minmax);
+ EXPORT_SYMBOL(proc_dointvec_minmax_sysadmin);
+ EXPORT_SYMBOL(proc_dointvec_userhz_jiffies);
+ EXPORT_SYMBOL(proc_dointvec_ms_jiffies);
+ EXPORT_SYMBOL(proc_dostring);
+ EXPORT_SYMBOL(proc_doulongvec_minmax);
+ EXPORT_SYMBOL(proc_doulongvec_ms_jiffies_minmax);
+ EXPORT_SYMBOL(proc_do_large_bitmap);
diff --color -rcNP Master/kernel/sysctl.c.rej OG/kernel/sysctl.c.rej
*** Master/kernel/sysctl.c.rej	1969-12-31 19:00:00.000000000 -0500
--- OG/kernel/sysctl.c.rej	2021-04-20 15:11:27.328000000 -0400
***************
*** 0 ****
--- 1,209 ----
+ *** kernel/sysctl.c	2021-03-13 19:12:17.000000000 +0200
+ --- kernel/sysctl.c	2021-03-13 19:47:21.000000000 +0200
+ ***************
+ *** 1386,1392 ****
+   		.proc_handler	= overcommit_kbytes_handler,
+   	},
+   	{
+ ! 		.procname	= "page-cluster",
+   		.data		= &page_cluster,
+   		.maxlen		= sizeof(int),
+   		.mode		= 0644,
+ --- 1338,1344 ----
+   		.proc_handler	= overcommit_kbytes_handler,
+   	},
+   	{
+ ! 		.procname	= "page-cluster",
+   		.data		= &page_cluster,
+   		.maxlen		= sizeof(int),
+   		.mode		= 0644,
+ ***************
+ *** 1904,1910 ****
+   		.mode		= 0555,
+   		.child		= inotify_table,
+   	},
+ ! #endif
+   #ifdef CONFIG_EPOLL
+   	{
+   		.procname	= "epoll",
+ --- 1856,1862 ----
+   		.mode		= 0555,
+   		.child		= inotify_table,
+   	},
+ ! #endif
+   #ifdef CONFIG_EPOLL
+   	{
+   		.procname	= "epoll",
+ ***************
+ *** 2385,2396 ****
+   	int *i, vleft, first = 1, err = 0;
+   	size_t left;
+   	char *kbuf = NULL, *p;
+ ! 
+   	if (!tbl_data || !table->maxlen || !*lenp || (*ppos && !write)) {
+   		*lenp = 0;
+   		return 0;
+   	}
+ ! 
+   	i = (int *) tbl_data;
+   	vleft = table->maxlen / sizeof(*i);
+   	left = *lenp;
+ --- 2337,2348 ----
+   	int *i, vleft, first = 1, err = 0;
+   	size_t left;
+   	char *kbuf = NULL, *p;
+ ! 
+   	if (!tbl_data || !table->maxlen || !*lenp || (*ppos && !write)) {
+   		*lenp = 0;
+   		return 0;
+   	}
+ ! 
+   	i = (int *) tbl_data;
+   	vleft = table->maxlen / sizeof(*i);
+   	left = *lenp;
+ ***************
+ *** 2646,2727 ****
+   				 do_proc_douintvec_conv, NULL);
+   }
+   
+ - static int do_proc_dointvec_conv_secure(bool *negp, unsigned long *lvalp,
+ - 				 int *valp,
+ - 				 int write, void *data)
+ - {
+ - 	if (write) {
+ - 		if (*negp) {
+ - 			if (*lvalp > (unsigned long) INT_MAX + 1)
+ - 				return -EINVAL;
+ - //			pax_open_kernel(); for PAX_KERNEXEC ;)
+ - 			*valp = -*lvalp;
+ - //			pax_close_kernel();
+ - 		} else {
+ - 			if (*lvalp > (unsigned long) INT_MAX)
+ - 				return -EINVAL;
+ - //			pax_open_kernel(); for PAX_KERNEXEC ;)
+ - 			*valp = *lvalp;
+ - //			pax_close_kernel();
+ - 		}
+ - 	} else {
+ - 		int val = *valp;
+ - 		if (val < 0) {
+ - 			*negp = true;
+ - 			*lvalp = -(unsigned long)val;
+ - 		} else {
+ - 			*negp = false;
+ - 			*lvalp = (unsigned long)val;
+ - 		}
+ - 	}
+ - 	return 0;
+ - }
+ - 
+ - int proc_dointvec_secure(struct ctl_table *table, int write,
+ - 		     void __user *buffer, size_t *lenp, loff_t *ppos)
+ - {
+ - 	return do_proc_dointvec(table,write,buffer,lenp,ppos,
+ - 				do_proc_dointvec_conv_secure,NULL);
+ - }
+ - 
+ - static int do_proc_dointvec_conv_secure(bool *negp, unsigned long *lvalp,
+ - 				 int *valp,
+ - 				 int write, void *data)
+ - {
+ - 	if (write) {
+ - 		if (*negp) {
+ - 			if (*lvalp > (unsigned long) INT_MAX + 1)
+ - 				return -EINVAL;
+ - //			pax_open_kernel(); for PAX_KERNEXEC ;)
+ - 			*valp = -*lvalp;
+ - //			pax_close_kernel();
+ - 		} else {
+ - 			if (*lvalp > (unsigned long) INT_MAX)
+ - 				return -EINVAL;
+ - //			pax_open_kernel(); for PAX_KERNEXEC ;)
+ - 			*valp = *lvalp;
+ - //			pax_close_kernel();
+ - 		}
+ - 	} else {
+ - 		int val = *valp;
+ - 		if (val < 0) {
+ - 			*negp = true;
+ - 			*lvalp = -(unsigned long)val;
+ - 		} else {
+ - 			*negp = false;
+ - 			*lvalp = (unsigned long)val;
+ - 		}
+ - 	}
+ - 	return 0;
+ - }
+ - 
+ - int proc_dointvec_secure(struct ctl_table *table, int write,
+ - 		     void __user *buffer, size_t *lenp, loff_t *ppos)
+ - {
+ - 	return do_proc_dointvec(table,write,buffer,lenp,ppos,
+ - 				do_proc_dointvec_conv_secure,NULL);
+ - }
+ - 
+   /*
+    * Taint values can only be increased
+    * This means we can safely use a temporary.
+ --- 2598,2603 ----
+   				 do_proc_douintvec_conv, NULL);
+   }
+   
+   /*
+    * Taint values can only be increased
+    * This means we can safely use a temporary.
+ ***************
+ *** 3205,3211 ****
+    * @ppos: file position
+    *
+    * Reads/writes up to table->maxlen/sizeof(unsigned int) integer
+ !  * values from/to the user buffer, treated as an ASCII string.
+    * The values read are assumed to be in seconds, and are converted into
+    * jiffies.
+    *
+ --- 3081,3087 ----
+    * @ppos: file position
+    *
+    * Reads/writes up to table->maxlen/sizeof(unsigned int) integer
+ !  * values from/to the user buffer, treated as an ASCII string.
+    * The values read are assumed to be in seconds, and are converted into
+    * jiffies.
+    *
+ ***************
+ *** 3227,3234 ****
+    * @ppos: pointer to the file position
+    *
+    * Reads/writes up to table->maxlen/sizeof(unsigned int) integer
+ !  * values from/to the user buffer, treated as an ASCII string.
+ !  * The values read are assumed to be in 1/USER_HZ seconds, and
+    * are converted into jiffies.
+    *
+    * Returns 0 on success.
+ --- 3103,3110 ----
+    * @ppos: pointer to the file position
+    *
+    * Reads/writes up to table->maxlen/sizeof(unsigned int) integer
+ !  * values from/to the user buffer, treated as an ASCII string.
+ !  * The values read are assumed to be in 1/USER_HZ seconds, and
+    * are converted into jiffies.
+    *
+    * Returns 0 on success.
+ ***************
+ *** 3250,3257 ****
+    * @ppos: the current position in the file
+    *
+    * Reads/writes up to table->maxlen/sizeof(unsigned int) integer
+ !  * values from/to the user buffer, treated as an ASCII string.
+ !  * The values read are assumed to be in 1/1000 seconds, and
+    * are converted into jiffies.
+    *
+    * Returns 0 on success.
+ --- 3126,3133 ----
+    * @ppos: the current position in the file
+    *
+    * Reads/writes up to table->maxlen/sizeof(unsigned int) integer
+ !  * values from/to the user buffer, treated as an ASCII string.
+ !  * The values read are assumed to be in 1/1000 seconds, and
+    * are converted into jiffies.
+    *
+    * Returns 0 on success.
diff --color -rcNP Master/mm/mmap.c OG/mm/mmap.c
*** Master/mm/mmap.c	2021-04-20 14:17:31.000000000 -0400
--- OG/mm/mmap.c	2021-04-20 15:11:34.520000000 -0400
***************
*** 1464,1469 ****
--- 1464,1493 ----
  	vm_flags |= calc_vm_prot_bits(prot, pkey) | calc_vm_flag_bits(flags) |
  			mm->def_flags | VM_MAYREAD | VM_MAYWRITE | VM_MAYEXEC;
  
+ #ifdef CONFIG_MINISEC_MPROTECT
+ 	if (mm->pax_flags & MF_PAX_MPROTECT) {
+ 
+ #ifndef CONFIG_MINISEC_MPROTECT_COMPAT
+ 		if ((vm_flags & (VM_WRITE | VM_EXEC)) == (VM_WRITE | VM_EXEC)) {
+ 			return -EPERM;
+ 		}
+ 
+ 		if (!(vm_flags & VM_EXEC))
+ 			vm_flags &= ~VM_MAYEXEC;
+ #else
+ 		if ((vm_flags & (VM_WRITE | VM_EXEC)) != VM_EXEC)
+ 			vm_flags &= ~(VM_EXEC | VM_MAYEXEC);
+ #endif
+ 		else
+ 			vm_flags &= ~VM_MAYWRITE;
+ 	}
+ #endif
+ 
+ #if defined(CONFIG_MINISEC_PAGEEXEC) && defined(CONFIG_X86_32)
+ 	if ((mm->pax_flags & MF_PAX_PAGEEXEC) && file)
+ 		vm_flags &= ~VM_PAGEEXEC;
+ #endif
+ 
  	if (flags & MAP_LOCKED)
  		if (!can_do_mlock())
  			return -EPERM;
***************
*** 1804,1809 ****
--- 1828,1840 ----
  		if (error)
  			goto unmap_and_free_vma;
  
+ #if defined(CONFIG_MINISEC_PAGEEXEC) && defined(CONFIG_X86_32)
+ 		if ((mm->pax_flags & MF_PAX_PAGEEXEC) && !(vma->vm_flags & VM_SPECIAL)) {
+ 			vma->vm_flags |= VM_PAGEEXEC;
+ 			vma->vm_page_prot = vm_get_page_prot(vma->vm_flags);
+ 		}
+ #endif
+ 
  		/* Can addr have changed??
  		 *
  		 * Answer: Yes, several device drivers can do it in their
***************
*** 3023,3028 ****
--- 3054,3071 ----
  		return -EINVAL;
  	flags |= VM_DATA_DEFAULT_FLAGS | VM_ACCOUNT | mm->def_flags;
  
+ #if defined(CONFIG_MINISEC_PAGEEXEC)
+ 	if (mm->pax_flags & MF_PAX_PAGEEXEC) {
+ 		flags &= ~VM_EXEC;
+ 
+ #ifdef CONFIG_MINISEC_MPROTECT
+ 		if (mm->pax_flags & MF_PAX_MPROTECT)
+ 			flags &= ~VM_MAYEXEC;
+ #endif
+ 
+ 	}
+ #endif
+ 
  	error = get_unmapped_area(NULL, addr, len, 0, MAP_FIXED);
  	if (offset_in_page(error))
  		return error;
***************
*** 3431,3436 ****
--- 3474,3496 ----
  	vma->vm_flags = vm_flags | mm->def_flags | VM_DONTEXPAND | VM_SOFTDIRTY;
  	vma->vm_page_prot = vm_get_page_prot(vma->vm_flags);
  
+ #ifdef CONFIG_MINISEC_MPROTECT
+ 	if (mm->pax_flags & MF_PAX_MPROTECT) {	/* vma->vm_flags is already set above: restrict it, not the local copy */
+ #ifndef CONFIG_MINISEC_MPROTECT_COMPAT
+ 		if ((vm_flags & (VM_WRITE | VM_EXEC)) == (VM_WRITE | VM_EXEC))
+ 			return ERR_PTR(-EPERM);	/* NOTE(review): confirm the vma allocated earlier is not leaked here */
+ 		if (!(vm_flags & VM_EXEC))
+ 			vma->vm_flags &= ~VM_MAYEXEC;
+ #else
+ 		if ((vm_flags & (VM_WRITE | VM_EXEC)) != VM_EXEC)
+ 			vma->vm_flags &= ~(VM_EXEC | VM_MAYEXEC);
+ #endif
+ 		else
+ 			vma->vm_flags &= ~VM_MAYWRITE;
+ 		vma->vm_page_prot = vm_get_page_prot(vma->vm_flags);	/* flags changed: refresh protections */
+ 	}
+ #endif
+ 
  	vma->vm_ops = ops;
  	vma->vm_private_data = priv;
  
diff --color -rcNP Master/mm/mprotect.c OG/mm/mprotect.c
*** Master/mm/mprotect.c	2021-04-20 14:17:31.000000000 -0400
--- OG/mm/mprotect.c	2021-04-20 15:11:34.520000000 -0400
***************
*** 26,31 ****
--- 26,35 ----
  #include <linux/perf_event.h>
  #include <linux/pkeys.h>
  #include <linux/ksm.h>
+ #ifdef CONFIG_MINISEC_MPROTECT
+ #include <linux/elf.h>
+ #include <linux/binfmts.h>
+ #endif
  #include <linux/uaccess.h>
  #include <linux/mm_inline.h>
  #include <asm/pgtable.h>
***************
*** 455,460 ****
--- 459,468 ----
  	 * held in write mode.
  	 */
  	vma->vm_flags = newflags;
+ #ifdef CONFIG_MINISEC_MPROTECT
+ 	if (mm->binfmt && mm->binfmt->handle_mprotect)
+ 		mm->binfmt->handle_mprotect(vma, newflags);
+ #endif
  	dirty_accountable = vma_wants_writenotify(vma, vma->vm_page_prot);
  	vma_set_page_prot(vma);
  
***************
*** 548,553 ****
--- 556,566 ----
  	if (start > vma->vm_start)
  		prev = vma;
  
+ #ifdef CONFIG_MINISEC_MPROTECT
+ 	if (current->mm->binfmt && current->mm->binfmt->handle_mprotect)
+ 		current->mm->binfmt->handle_mprotect(vma, calc_vm_prot_bits(prot, 0));
+ #endif
+ 
  	for (nstart = start ; ; ) {
  		unsigned long mask_off_old_flags;
  		unsigned long newflags;
diff --color -rcNP Master/mm/shmem.c OG/mm/shmem.c
*** Master/mm/shmem.c	2021-04-20 14:17:31.000000000 -0400
--- OG/mm/shmem.c	2021-04-20 15:11:34.521000000 -0400
***************
*** 3248,3253 ****
--- 3248,3271 ----
  	return simple_xattr_set(&info->xattrs, name, value, size, flags);
  }
  
+ #ifdef CONFIG_MINISEC_XATTR_PAX_FLAGS
+ static int shmem_user_xattr_handler_set(const struct xattr_handler *handler,
+ 					struct dentry *dentry, struct inode *inode,
+ 					const char *name, const void *value,
+ 					size_t size, int flags)
+ {
+ 	struct shmem_inode_info *info = SHMEM_I(inode);
+ 
+ 	if (strcmp(name, XATTR_NAME_PAX_FLAGS))	/* only the PaX flags attribute is accepted */
+ 		return -EOPNOTSUPP;
+ 	if (size > 8)				/* reject values longer than 8 bytes */
+ 		return -EINVAL;
+ 	/* Same 5-argument simple_xattr_set() form as the existing handler in this file. */
+ 	name = xattr_full_name(handler, name);
+ 	return simple_xattr_set(&info->xattrs, name, value, size, flags);
+ }
+ #endif
+ 
  static const struct xattr_handler shmem_security_xattr_handler = {
  	.prefix = XATTR_SECURITY_PREFIX,
  	.get = shmem_xattr_handler_get,
***************
*** 3260,3265 ****
--- 3278,3291 ----
  	.set = shmem_xattr_handler_set,
  };
  
+ #ifdef CONFIG_MINISEC_XATTR_PAX_FLAGS
+ static const struct xattr_handler shmem_user_xattr_handler = {
+ 	.prefix = XATTR_USER_PREFIX,
+ 	.get = shmem_xattr_handler_get,
+ 	.set = shmem_user_xattr_handler_set,
+ };
+ #endif
+ 
  static const struct xattr_handler *shmem_xattr_handlers[] = {
  #ifdef CONFIG_TMPFS_POSIX_ACL
  	&posix_acl_access_xattr_handler,
***************
*** 3267,3272 ****
--- 3293,3301 ----
  #endif
  	&shmem_security_xattr_handler,
  	&shmem_trusted_xattr_handler,
+ #ifdef CONFIG_MINISEC_XATTR_PAX_FLAGS
+ 	&shmem_user_xattr_handler,
+ #endif
  	NULL
  };
  
diff --color -rcNP Master/mm/slab_common.c OG/mm/slab_common.c
*** Master/mm/slab_common.c	2021-04-20 14:17:31.000000000 -0400
--- OG/mm/slab_common.c	2021-04-20 15:11:34.521000000 -0400
***************
*** 598,604 ****
  	rcu_barrier();
  
  	list_for_each_entry_safe(s, s2, &to_destroy, list) {
! #ifdef SLAB_SUPPORTS_SYSFS
  		sysfs_slab_release(s);
  #else
  		slab_kmem_cache_release(s);
--- 598,604 ----
  	rcu_barrier();
  
  	list_for_each_entry_safe(s, s2, &to_destroy, list) {
! #if defined(SLAB_SUPPORTS_SYSFS) && !defined(CONFIG_MINISEC_PROC_ADD)
  		sysfs_slab_release(s);
  #else
  		slab_kmem_cache_release(s);
***************
*** 618,630 ****
  	list_del(&s->list);
  
  	if (s->flags & SLAB_TYPESAFE_BY_RCU) {
! #ifdef SLAB_SUPPORTS_SYSFS
  		sysfs_slab_unlink(s);
  #endif
  		list_add_tail(&s->list, &slab_caches_to_rcu_destroy);
  		schedule_work(&slab_caches_to_rcu_destroy_work);
  	} else {
! #ifdef SLAB_SUPPORTS_SYSFS
  		sysfs_slab_unlink(s);
  		sysfs_slab_release(s);
  #else
--- 618,630 ----
  	list_del(&s->list);
  
  	if (s->flags & SLAB_TYPESAFE_BY_RCU) {
! #if defined(SLAB_SUPPORTS_SYSFS) && !defined(CONFIG_MINISEC_PROC_ADD)
  		sysfs_slab_unlink(s);
  #endif
  		list_add_tail(&s->list, &slab_caches_to_rcu_destroy);
  		schedule_work(&slab_caches_to_rcu_destroy_work);
  	} else {
! #if defined(SLAB_SUPPORTS_SYSFS) && !defined(CONFIG_MINISEC_PROC_ADD)
  		sysfs_slab_unlink(s);
  		sysfs_slab_release(s);
  #else
diff --color -rcNP Master/mm/slub.c OG/mm/slub.c
*** Master/mm/slub.c	2021-04-20 14:17:31.000000000 -0400
--- OG/mm/slub.c	2021-04-20 15:11:34.522000000 -0400
***************
*** 219,225 ****
  
  enum track_item { TRACK_ALLOC, TRACK_FREE };
  
! #ifdef CONFIG_SYSFS
  static int sysfs_slab_add(struct kmem_cache *);
  static int sysfs_slab_alias(struct kmem_cache *, const char *);
  static void memcg_propagate_slab_attrs(struct kmem_cache *s);
--- 219,225 ----
  
  enum track_item { TRACK_ALLOC, TRACK_FREE };
  
! #if defined(CONFIG_SYSFS) && !defined(CONFIG_MINISEC_PROC_ADD)
  static int sysfs_slab_add(struct kmem_cache *);
  static int sysfs_slab_alias(struct kmem_cache *, const char *);
  static void memcg_propagate_slab_attrs(struct kmem_cache *s);
***************
*** 4934,4940 ****
  #endif
  #endif	/* SLUB_RESILIENCY_TEST */
  
! #ifdef CONFIG_SYSFS
  enum slab_stat_type {
  	SL_ALL,			/* All slabs */
  	SL_PARTIAL,		/* Only partially allocated slabs */
--- 4934,4940 ----
  #endif
  #endif	/* SLUB_RESILIENCY_TEST */
  
! #if defined(CONFIG_SYSFS) && !defined(CONFIG_MINISEC_PROC_ADD)
  enum slab_stat_type {
  	SL_ALL,			/* All slabs */
  	SL_PARTIAL,		/* Only partially allocated slabs */
***************
*** 5903,5908 ****
--- 5903,5909 ----
  	kobject_put(&s->kobj);
  }
  
+ #if defined(CONFIG_SYSFS) && !defined(CONFIG_MINISEC_PROC_ADD)
  static int sysfs_slab_add(struct kmem_cache *s)
  {
  	int err;
***************
*** 5982,5987 ****
--- 5983,5989 ----
  	kobject_get(&s->kobj);
  	schedule_work(&s->kobj_remove_work);
  }
+ #endif
  
  void sysfs_slab_unlink(struct kmem_cache *s)
  {
***************
*** 6007,6012 ****
--- 6009,6015 ----
  
  static struct saved_alias *alias_list;
  
+ #if defined(CONFIG_SYSFS) && !defined(CONFIG_MINISEC_PROC_ADD)
  static int sysfs_slab_alias(struct kmem_cache *s, const char *name)
  {
  	struct saved_alias *al;
***************
*** 6029,6034 ****
--- 6032,6038 ----
  	alias_list = al;
  	return 0;
  }
+ #endif
  
  static int __init slab_sysfs_init(void)
  {
diff --color -rcNP Master/mm/util.c OG/mm/util.c
*** Master/mm/util.c	2021-04-20 14:17:31.000000000 -0400
--- OG/mm/util.c	2021-04-20 15:11:34.522000000 -0400
***************
*** 308,313 ****
--- 308,318 ----
  {
  	unsigned long random_variable = 0;
  
+ #ifdef CONFIG_MINISEC_RANDUSTACK
+ 	if (current->mm->pax_flags & MF_PAX_RANDMMAP)
+ 		return stack_top - current->mm->delta_stack;
+ #endif
+ 
  	if (current->flags & PF_RANDOMIZE) {
  		random_variable = get_random_long();
  		random_variable &= STACK_RND_MASK;
diff --color -rcNP Master/mm/vmstat.c OG/mm/vmstat.c
*** Master/mm/vmstat.c	2021-04-20 14:17:31.000000000 -0400
--- OG/mm/vmstat.c	2021-04-20 15:11:34.522000000 -0400
***************
*** 28,33 ****
--- 28,34 ----
  #include <linux/mm_inline.h>
  #include <linux/page_ext.h>
  #include <linux/page_owner.h>
+ #include <linux/minisec.h>
  
  #include "internal.h"
  
***************
*** 1690,1695 ****
--- 1691,1707 ----
  		v[i] = global_zone_page_state(i);
  	v += NR_VM_ZONE_STAT_ITEMS;
  
+ #ifdef CONFIG_MINISEC_PROC_ADD
+ #if defined(CONFIG_MINISEC_PROC_USER) || defined(CONFIG_MINISEC_PROC_USERGROUP)
+         if (!uid_eq(current_uid(), GLOBAL_ROOT_UID)
+ #ifdef CONFIG_MINISEC_PROC_USERGROUP
+                 && !in_group_p(grsec_proc_gid)
+ #endif
+         )
+ 		return (unsigned long *)m->private + *pos;
+ #endif
+ #endif
+ 
  #ifdef CONFIG_NUMA
  	for (i = 0; i < NR_VM_NUMA_STAT_ITEMS; i++)
  		v[i] = global_numa_state(i);
***************
*** 1988,1997 ****
  	start_shepherd_timer();
  #endif
  #ifdef CONFIG_PROC_FS
! 	proc_create_seq("buddyinfo", 0444, NULL, &fragmentation_op);
! 	proc_create_seq("pagetypeinfo", 0400, NULL, &pagetypeinfo_op);
! 	proc_create_seq("vmstat", 0444, NULL, &vmstat_op);
! 	proc_create_seq("zoneinfo", 0444, NULL, &zoneinfo_op);
  #endif
  }
  
--- 2000,2016 ----
  	start_shepherd_timer();
  #endif
  #ifdef CONFIG_PROC_FS
! {
! 	umode_t gr_mode = S_IRUGO;	/* default: world-readable, as before the patch */
! #ifdef CONFIG_MINISEC_PROC_ADD
! 	gr_mode = S_IRUSR;		/* hardened: owner-read only */
! #endif
! 	proc_create_seq("buddyinfo", gr_mode, NULL, &fragmentation_op);
! 	proc_create_seq("pagetypeinfo", S_IRUSR, NULL, &pagetypeinfo_op);	/* was 0400; never widen */
! 	proc_create_seq("vmstat", gr_mode, NULL, &vmstat_op);
! 	proc_create_seq("zoneinfo", gr_mode, NULL, &zoneinfo_op);
! }
! 
  #endif
  }
  
diff --color -rcNP Master/net/unix/af_unix.c OG/net/unix/af_unix.c
*** Master/net/unix/af_unix.c	2021-04-20 14:17:32.000000000 -0400
--- OG/net/unix/af_unix.c	2021-04-20 15:11:34.523000000 -0400
***************
*** 113,118 ****
--- 113,119 ----
  #include <linux/security.h>
  #include <linux/freezer.h>
  #include <linux/file.h>
+ #include <linux/minisec.h>
  
  #include "scm.h"
  
***************
*** 954,959 ****
--- 955,967 ----
  		u = unix_find_socket_byname(net, sunname, len, type, hash);
  		if (u) {
  			struct dentry *dentry;
+ 
+ 			if (!gr_handle_chroot_unix(pid_vnr(u->sk_peer_pid))) {
+ 				err = -EPERM;
+ 				sock_put(u);
+ 				goto fail;
+ 			}
+ 
  			dentry = unix_sk(u)->path.dentry;
  			if (dentry)
  				touch_atime(&unix_sk(u)->path);
diff --color -rcNP Master/security/Kconfig OG/security/Kconfig
*** Master/security/Kconfig	2021-04-20 14:17:32.000000000 -0400
--- OG/security/Kconfig	2021-04-20 15:11:34.523000000 -0400
***************
*** 7,12 ****
--- 7,666 ----
  
  source "security/keys/Kconfig"
  
+ menu "miniSEC"
+ 
+ config MINISEC
+ 	bool "Enable various miniSEC features"
+ 	default n
+ 	depends on (ARM && (CPU_V6 || CPU_V6K || CPU_V7)) || X86
+ 	help
+ 	  **WARNING**: This project is a fork of grsecurity and PaX!
+ 	  For more information: https://grsecurity.net/
+ 
+ 	  This allows you to enable various miniSEC features.  miniSEC adds
+ 	  intrusion prevention mechanisms to the kernel that reduce
+ 	  the risks posed by exploitable memory corruption bugs.
+ 
+ config TASK_SIZE_MAX_SHIFT
+ 	int
+ 	depends on X86_64
+ 	default 47
+ 
+ menu "miniSEC Control"
+ 	depends on MINISEC
+ 
+ config MINISEC_SOFTMODE
+ 	bool 'Support soft mode'
+ 	help
+ 	  Enabling this option will allow you to run miniSEC in soft mode, that
+ 	  is, PaX features will not be enforced by default, only on executables
+ 	  marked explicitly.  You must also enable PT_PAX_FLAGS or XATTR_PAX_FLAGS
+ 	  support as they are the only way to mark executables for soft mode use.
+ 
+ 	  Soft mode can be activated by using the "pax_softmode=1" kernel command
+ 	  line option on boot.  Furthermore you can control various PaX features
+ 	  at runtime via the entries in /proc/sys/kernel/pax.
+ 
+ config MINISEC_EI_PAX
+ 	bool 'Use legacy ELF header marking'
+ 	default n
+ 	help
+ 	  Enabling this option will allow you to control miniSEC features on
+ 	  a per executable basis via the 'chpax' utility available at
+ 	  https://localhost/.  The control flags will be read from
+ 	  an otherwise reserved part of the ELF header. This marking has
+ 	  numerous drawbacks (no support for soft-mode, toolchain does not
+ 	  know about the non-standard use of the ELF header) therefore it
+ 	  has been deprecated in favour of PT_PAX_FLAGS and XATTR_PAX_FLAGS
+ 	  support.
+ 
+ 	  Note that if you enable PT_PAX_FLAGS or XATTR_PAX_FLAGS marking
+ 	  support as well, they will override the legacy EI_PAX marks.
+ 
+ 	  If you enable none of the marking options then all applications
+ 	  will run with miniSEC enabled on them by default.
+ 
+ config MINISEC_PT_PAX_FLAGS
+ 	bool 'Use ELF program header marking'
+ 	default n
+ 	help
+ 	  Enabling this option will allow you to control miniSEC features on
+ 	  a per executable basis via the 'paxctl' utility available at
+ 	  https://localhost/.  The control flags will be read from
+ 	  a PaX specific ELF program header (PT_PAX_FLAGS).
+ 
+ 	  Note that if you enable the legacy EI_PAX marking support as well,
+ 	  the EI_PAX marks will be overridden by the PT_PAX_FLAGS marks.
+ 
+ 	  If you enable both PT_PAX_FLAGS and XATTR_PAX_FLAGS support then you
+ 	  must make sure that the marks are the same if a binary has both marks.
+ 
+ 	  If you enable none of the marking options then all applications
+ 	  will run with miniSEC enabled on them by default.
+ 
+ config MINISEC_XATTR_PAX_FLAGS
+ 	bool 'Use filesystem extended attributes marking'
+ 	default n
+ 	select CIFS_XATTR if CIFS
+ 	select EXT2_FS_XATTR if EXT2_FS
+ 	select EXT3_FS_XATTR if EXT3_FS
+ 	select F2FS_FS_XATTR if F2FS_FS
+ 	select JFFS2_FS_XATTR if JFFS2_FS
+ 	select REISERFS_FS_XATTR if REISERFS_FS
+ 	select SQUASHFS_XATTR if SQUASHFS
+ 	select TMPFS_XATTR if TMPFS
+ 	help
+ 	  Enabling this option will allow you to control miniSEC features on
+ 	  a per executable basis via the 'setfattr' utility.  The control
+ 	  flags will be read from the user.pax.flags extended attribute of
+ 	  the file.  This marking has the benefit of supporting binary-only
+ 	  applications that verify their own integrity (e.g., skype) and would
+ 	  not tolerate chpax/paxctl changes.  The main drawback is that
+ 	  extended attributes are not supported by some filesystems (e.g.,
+ 	  isofs, udf, vfat) so copying files through such filesystems will
+ 	  lose the extended attributes and these miniSEC markings.
+ 
+ 	  Note that if you enable the legacy EI_PAX marking support as well,
+ 	  the EI_PAX marks will be overridden by the XATTR_PAX_FLAGS marks.
+ 
+ 	  If you enable both PT_PAX_FLAGS and XATTR_PAX_FLAGS support then you
+ 	  must make sure that the marks are the same if a binary has both marks.
+ 
+ 	  If you enable none of the marking options then all applications
+ 	  will run with miniSEC enabled on them by default.
+ 
+ endmenu
+ 
+ menu "Non-executable pages"
+ 	depends on MINISEC
+ 
+ config MINISEC_NOEXEC
+ 	bool "Enforce non-executable pages"
+ 	default n
+ 	depends on (ARM && (CPU_V6 || CPU_V6K || CPU_V7)) || X86
+ 	help
+ 	  By design some architectures do not allow for protecting memory
+ 	  pages against execution or even if they do, Linux does not make
+ 	  use of this feature.  In practice this means that if a page is
+ 	  readable (such as the stack or heap) it is also executable.
+ 
+ 	  There is a well known exploit technique that makes use of this
+ 	  fact and a common programming mistake where an attacker can
+ 	  introduce code of his choice somewhere in the attacked program's
+ 	  memory (typically the stack or the heap) and then execute it.
+ 
+ 	  If the attacked program was running with different (typically
+ 	  higher) privileges than that of the attacker, then he can elevate
+ 	  his own privilege level (e.g. get a root shell, write to files for
+ 	  which he does not have write access, etc).
+ 
+ 	  Enabling this option will let you choose from various features
+ 	  that prevent the injection and execution of 'foreign' code in
+ 	  a program.
+ 
+ 	  This will also break programs that rely on the old behaviour and
+ 	  expect that dynamically allocated memory via the malloc() family
+ 	  of functions is executable (which it is not).  Notable examples
+ 	  are the XFree86 4.x server, the java runtime and wine.
+ 
+ config MINISEC_PAGEEXEC
+ 	bool "Paging based non-executable pages"
+ 	default n
+ 	depends on MINISEC_NOEXEC && (!X86_32 || M586 || M586TSC || M586MMX || M686 || MPENTIUMII || MPENTIUMIII || MPENTIUMM || MCORE2 || MATOM || MPENTIUM4 || MPSC || MK7 || MK8 || MWINCHIPC6 || MWINCHIP2 || MWINCHIP3D || MVIAC3_2 || MVIAC7)
+ 	select ARCH_TRACK_EXEC_LIMIT if X86_32
+ 	select ARCH_NEEDS_NX if X86_64 || X86_PAE
+ 	help
+ 	  This implementation is based on the paging feature of the CPU.
+ 	  On i386 without hardware non-executable bit support there is a
+ 	  variable but usually low performance impact, however on Intel's
+ 	  P4 core based CPUs it is very high so you should not enable this
+ 	  for kernels meant to be used on such CPUs.
+ 
+ 	  On alpha, avr32, ia64, parisc, sparc, sparc64, x86_64 and i386
+ 	  with hardware non-executable bit support there is no performance
+ 	  impact, on ppc the impact is negligible.
+ 
+ 	  Note that several architectures require various emulations due to
+ 	  badly designed userland ABIs, this will cause a performance impact
+ 	  but will disappear as soon as userland is fixed. For example, ppc
+ 	  userland MUST have been built with secure-plt by a recent toolchain.
+ 
+ config MINISEC_EMUTRAMP
+ 	bool "Emulate trampolines"
+ 	default n
+ 	depends on MINISEC_PAGEEXEC && X86
+ 	help
+ 	  There are some programs and libraries that for one reason or
+ 	  another attempt to execute special small code snippets from
+ 	  non-executable memory pages.  Most notable examples are the
+ 	  signal handler return code generated by the kernel itself and
+ 	  the GCC trampolines.
+ 
+ 	  If you enabled CONFIG_MINISEC_PAGEEXEC then
+ 	  such programs will no longer work under your kernel.
+ 
+ 	  As a remedy you can say Y here and use the 'chpax' or 'paxctl'
+ 	  utilities to enable trampoline emulation for the affected programs
+ 	  yet still have the protection provided by the non-executable pages.
+ 
+ 	  Alternatively you can say N here and use the 'chpax' or 'paxctl'
+ 	  utilities to disable CONFIG_MINISEC_PAGEEXEC for the affected files.
+ 
+ 	  NOTE: enabling this feature *may* open up a loophole in the
+ 	  protection provided by non-executable pages that an attacker
+ 	  could abuse.  Therefore the best solution is to not have any
+ 	  files on your system that would require this option.  This can
+ 	  be achieved by not using libc5 (which relies on the kernel
+ 	  signal handler return code) and not using or rewriting programs
+ 	  that make use of the nested function implementation of GCC.
+ 	  Skilled users can just fix GCC itself so that it implements
+ 	  nested function calls in a way that does not interfere with miniSEC.
+ 
+ config MINISEC_MPROTECT
+ 	bool "Restrict mprotect()"
+ 	default n
+ 	depends on MINISEC_PAGEEXEC
+ 	help
+ 	  Enabling this option will prevent programs from
+ 	   - changing the executable status of memory pages that were
+ 	     not originally created as executable,
+ 	   - making read-only executable pages writable again,
+ 	   - creating executable pages from anonymous memory,
+ 	   - making read-only-after-relocations (RELRO) data pages writable again.
+ 
+ 	  You should say Y here to complete the protection provided by
+ 	  the enforcement of non-executable pages.
+ 
+ 	  NOTE: you can use the 'chpax' or 'paxctl' utilities to control
+ 	  this feature on a per file basis.
+ 
+ config MINISEC_MPROTECT_COMPAT
+ 	bool "Use legacy/compat protection demoting (read help)"
+ 	depends on MINISEC_MPROTECT
+ 	default n
+ 	help
+ 	  The current implementation of MINISEC_MPROTECT denies RWX allocations/mprotects
+ 	  by sending the proper error code to the application.  For some older
+ 	  userland, this can cause problems with applications that assume such
+ 	  allocations will not be prevented by miniSEC or SELinux and other access
+ 	  control systems and have no fallback mechanisms.  For modern distros,
+ 	  this option should generally be set to 'N'.
+ 
+ endmenu
+ 
+ menu "Address Space Layout Randomization"
+ 	depends on MINISEC
+ 
+ config MINISEC_ASLR
+ 	bool "Address Space Layout Randomization"
+ 	default n
+ 	help
+ 	  Many if not most exploit techniques rely on the knowledge of
+ 	  certain addresses in the attacked program.  The following options
+ 	  will allow the kernel to apply a certain amount of randomization
+ 	  to specific parts of the program thereby forcing an attacker to
+ 	  guess them in most cases.  Any failed guess will most likely crash
+ 	  the attacked program which allows the kernel to detect such attempts
+ 	  and react on them.  miniSEC itself provides no reaction mechanisms,
+ 	  instead it is strongly encouraged that you make use of miniSEC's
+ 	  (https://localhost/) built-in crash detection features or
+ 	  develop one yourself.
+ 
+ 	  By saying Y here you can choose to randomize the following areas:
+ 	   - top of the task's kernel stack
+ 	   - top of the task's userland stack
+ 	   - base address for mmap() requests that do not specify one
+ 	     (this includes all libraries)
+ 	   - base address of the main executable
+ 
+ 	  It is strongly recommended to say Y here as address space layout
+ 	  randomization has negligible impact on performance yet it provides
+ 	  a very effective protection.
+ 
+ 	  NOTE: you can use the 'chpax' or 'paxctl' utilities to control
+ 	  this feature on a per file basis.
+ 
+ config MINISEC_RANDKSTACK
+ 	bool "Randomize kernel stack base"
+ 	default n
+ 	depends on X86_TSC && X86 && !XEN_PV
+ 	help
+ 	  By saying Y here the kernel will randomize every task's kernel
+ 	  stack on every system call.  This will not only force an attacker
+ 	  to guess it but also prevent him from making use of possible
+ 	  leaked information about it.
+ 
+ 	  Since the kernel stack is a rather scarce resource, randomization
+ 	  may cause unexpected stack overflows, therefore you should very
+ 	  carefully test your system.  Note that once enabled in the kernel
+ 	  configuration, this feature cannot be disabled on a per file basis.
+ 
+ config MINISEC_RANDUSTACK
+ 	bool
+ 
+ endmenu
+ 
+ menu "Filesystem Protections"
+ depends on MINISEC
+ 
+ config MINISEC_CHROOT
+ 	bool "Chroot jail restrictions"
+ 	default n
+ 	help
+ 	  If you say Y here, you will be able to choose several options that will
+ 	  make breaking out of a chrooted jail much more difficult.  If you
+ 	  encounter no software incompatibilities with the following options, it
+ 	  is recommended that you enable each one.
+ 
+ 	  Note that the chroot restrictions are not intended to apply to "chroots"
+ 	  to directories that are simple bind mounts of the global root filesystem.
+ 	  For several other reasons, a user shouldn't expect any significant
+ 	  security by performing such a chroot.
+ 
+ config MINISEC_CHROOT_MOUNT
+ 	bool "Deny mounts"
+ 	default n
+ 	depends on MINISEC_CHROOT
+ 	help
+ 	  If you say Y here, processes inside a chroot will not be able to
+ 	  mount or remount filesystems.  If the sysctl option is enabled, a
+ 	  sysctl option with name "chroot_deny_mount" is created.
+ 
+ config MINISEC_CHROOT_DOUBLE
+ 	bool "Deny double-chroots"
+ 	default n
+ 	depends on MINISEC_CHROOT
+ 	help
+ 	  If you say Y here, processes inside a chroot will not be able to chroot
+ 	  again outside the chroot.  This is a widely used method of breaking
+ 	  out of a chroot jail and should not be allowed.  If the sysctl
+ 	  option is enabled, a sysctl option with name
+ 	  "chroot_deny_chroot" is created.
+ 
+ config MINISEC_CHROOT_PIVOT
+ 	bool "Deny pivot_root in chroot"
+ 	default n
+ 	depends on MINISEC_CHROOT
+ 	help
+ 	  If you say Y here, processes inside a chroot will not be able to use
+ 	  a function called pivot_root() that was introduced in Linux 2.3.41.  It
+ 	  works similarly to chroot in that it changes the root filesystem.  This
+ 	  function could be misused in a chrooted process to attempt to break out
+ 	  of the chroot, and therefore should not be allowed.  If the sysctl
+ 	  option is enabled, a sysctl option with name "chroot_deny_pivot" is
+ 	  created.
+ 
+ config MINISEC_CHROOT_CHDIR
+ 	bool "Enforce chdir(\"/\") on all chroots"
+ 	default n
+ 	depends on MINISEC_CHROOT
+ 	help
+ 	  If you say Y here, the current working directory of all newly-chrooted
+ 	  applications will be set to the root directory of the chroot.
+ 	  The man page on chroot(2) states:
+ 	  Note that this call does not change  the  current  working
+ 	  directory,  so  that `.' can be outside the tree rooted at
+ 	  `/'.  In particular, the  super-user  can  escape  from  a
+ 	  `chroot jail' by doing `mkdir foo; chroot foo; cd ..'.
+ 
+ 	  It is recommended that you say Y here, since it's not known to break
+ 	  any software.  If the sysctl option is enabled, a sysctl option with
+ 	  name "chroot_enforce_chdir" is created.
+ 
+ config MINISEC_CHROOT_CHMOD
+ 	bool "Deny (f)chmod +s"
+ 	default n
+ 	depends on MINISEC_CHROOT
+ 	help
+ 	  If you say Y here, processes inside a chroot will not be able to chmod
+ 	  or fchmod files to make them have suid or sgid bits.  This protects
+ 	  against another published method of breaking a chroot.  If the sysctl
+ 	  option is enabled, a sysctl option with name "chroot_deny_chmod" is
+ 	  created.
+ 
+ config MINISEC_CHROOT_FCHDIR
+ 	bool "Deny fchdir and fhandle out of chroot"
+ 	default n
+ 	depends on MINISEC_CHROOT
+ 	help
+ 	  If you say Y here, a well-known method of breaking chroots by fchdir'ing
+ 	  to a file descriptor of the chrooting process that points to a directory
+ 	  outside the filesystem will be stopped.  This option also prevents use of
+ 	  the recently-created syscall for opening files by a guessable "file handle"
+ 	  inside a chroot, as well as accessing relative paths outside of a
+ 	  directory passed in via file descriptor with openat and similar syscalls.
+ 	  If the sysctl option is enabled, a sysctl option with name "chroot_deny_fchdir"
+ 	  is created.
+ 
+ config MINISEC_CHROOT_MKNOD
+ 	bool "Deny mknod"
+ 	default n
+ 	depends on MINISEC_CHROOT
+ 	help
+ 	  If you say Y here, processes inside a chroot will not be allowed to
+ 	  mknod.  The problem with using mknod inside a chroot is that it
+ 	  would allow an attacker to create a device entry that is the same
+ 	  as one on the physical root of your system, which could be
+ 	  anything from the console device to a device for your harddrive (which
+ 	  they could then use to wipe the drive or steal data).  It is recommended
+ 	  that you say Y here, unless you run into software incompatibilities.
+ 	  If the sysctl option is enabled, a sysctl option with name
+ 	  "chroot_deny_mknod" is created.
+ 
+ config MINISEC_CHROOT_SHMAT
+ 	bool "Deny shmat() out of chroot"
+ 	default n
+ 	depends on MINISEC_CHROOT
+ 	help
+ 	  If you say Y here, processes inside a chroot will not be able to attach
+ 	  to shared memory segments that were created outside of the chroot jail.
+ 	  It is recommended that you say Y here.  If the sysctl option is enabled,
+ 	  a sysctl option with name "chroot_deny_shmat" is created.
+ 
+ config MINISEC_CHROOT_UNIX
+ 	bool "Deny access to abstract AF_UNIX sockets out of chroot"
+ 	default n
+ 	depends on MINISEC_CHROOT
+ 	help
+ 	  If you say Y here, processes inside a chroot will not be able to
+ 	  connect to abstract (meaning not belonging to a filesystem) Unix
+ 	  domain sockets that were bound outside of a chroot.  It is recommended
+ 	  that you say Y here.  If the sysctl option is enabled, a sysctl option
+ 	  with name "chroot_deny_unix" is created.
+ 
+ config MINISEC_CHROOT_FINDTASK
+ 	bool "Protect outside processes"
+ 	default n
+ 	depends on MINISEC_CHROOT
+ 	help
+ 	  If you say Y here, processes inside a chroot will not be able to
+ 	  kill, send signals with fcntl, ptrace, capget, getpgid, setpgid,
+ 	  getsid, or view any process outside of the chroot.  If the sysctl
+ 	  option is enabled, a sysctl option with name "chroot_findtask" is
+ 	  created.
+ 
+ config MINISEC_CHROOT_NICE
+ 	bool "Restrict priority changes"
+ 	default n
+ 	depends on MINISEC_CHROOT
+ 	help
+ 	  If you say Y here, processes inside a chroot will not be able to raise
+ 	  the priority of processes in the chroot, or alter the priority of
+ 	  processes outside the chroot.  This provides more security than simply
+ 	  removing CAP_SYS_NICE from the process' capability set.  If the
+ 	  sysctl option is enabled, a sysctl option with name "chroot_restrict_nice"
+ 	  is created.
+ 
+ config MINISEC_CHROOT_SYSCTL
+ 	bool "Deny sysctl writes"
+ 	default n
+ 	depends on MINISEC_CHROOT
+ 	help
+ 	  If you say Y here, an attacker in a chroot will not be able to
+ 	  write to sysctl entries, either by sysctl(2) or through a /proc
+ 	  interface.  It is strongly recommended that you say Y here. If the
+ 	  sysctl option is enabled, a sysctl option with name
+ 	  "chroot_deny_sysctl" is created.
+ 
+ config MINISEC_CHROOT_RENAME
+ 	bool "Deny bad renames"
+ 	default n
+ 	depends on MINISEC_CHROOT
+ 	help
+ 	  If you say Y here, an attacker in a chroot will not be able to
+ 	  abuse the ability to create double chroots to break out of the
+ 	  chroot by exploiting a race condition between a rename of a directory
+ 	  within a chroot against an open of a symlink with relative path
+ 	  components.  This feature will likewise prevent an accomplice outside
+ 	  a chroot from enabling a user inside the chroot to break out and make
+ 	  use of their credentials on the global filesystem.  Enabling this
+ 	  feature is essential to prevent root users from breaking out of a
+ 	  chroot. If the sysctl option is enabled, a sysctl option with name
+ 	  "chroot_deny_bad_rename" is created.
+ 
+ config MINISEC_CHROOT_CAPS
+ 	bool "Capability restrictions"
+ 	default n
+ 	depends on MINISEC_CHROOT
+ 	help
+ 	  If you say Y here, the capabilities on all processes within a
+ 	  chroot jail will be lowered to stop module insertion, raw i/o,
+ 	  system and net admin tasks, rebooting the system, modifying immutable
+ 	  files, modifying IPC owned by another, and changing the system time.
+ 	  This is left an option because it can break some apps.  Disable this
+ 	  if your chrooted apps are having problems performing those kinds of
+ 	  tasks.  If the sysctl option is enabled, a sysctl option with
+ 	  name "chroot_caps" is created.
+ 
+ config MINISEC_CHROOT_INITRD
+ 	bool "Exempt initrd tasks from restrictions"
+ 	default n
+ 	depends on MINISEC_CHROOT && BLK_DEV_INITRD
+ 	help
+ 	  If you say Y here, tasks started prior to init will be exempted from
+ 	  miniSEC's chroot restrictions.  This option is mainly meant to
+ 	  resolve Plymouth's performing privileged operations unnecessarily
+ 	  in a chroot.
+ 
+ config MINISEC_PROC
+ 	bool "Proc restrictions"
+ 	default n
+ 	help
+ 	  If you say Y here, the permissions of the /proc filesystem
+ 	  will be altered to enhance system security and privacy.  You MUST
+ 	  choose either a user only restriction or a user and group restriction.
+ 	  Depending upon the option you choose, you can either restrict users to
+ 	  see only the processes they themselves run, or choose a group that can
+ 	  view all processes and files normally restricted to root if you choose
+ 	  the "restrict to user only" option.  NOTE: If you're running identd or
+ 	  ntpd as a non-root user, you will have to run it as the group you
+ 	  specify here.
+ 
+ config MINISEC_PROC_USER
+ 	bool "Restrict /proc to user only"
+ 	depends on MINISEC_PROC
+ 	help
+ 	  If you say Y here, non-root users will only be able to view their own
+ 	  processes, and will be restricted from viewing network-related
+ 	  information and kernel symbol and module information.
+ 
+ config MINISEC_PROC_USERGROUP
+ 	bool "Allow special group"
+ 	default n
+ 	depends on MINISEC_PROC && !MINISEC_PROC_USER
+ 	help
+ 	  If you say Y here, you will be able to select a group that will be
+ 	  able to view all processes and network-related information.  If you've
+ 	  enabled MINISEC_HIDESYM, kernel and symbol information may still
+ 	  remain hidden.  This option is useful if you want to run identd as
+ 	  a non-root user.  The group you select may also be chosen at boot time
+ 	  via "grsec_proc_gid=" on the kernel commandline.
+ 
+ config MINISEC_PROC_GID
+ 	int "GID for special group"
+ 	depends on MINISEC_PROC_USERGROUP
+ 	default 1001
+ 
+ config MINISEC_PROC_ADD
+ 	bool "Additional restrictions"
+ 	default n
+ 	depends on MINISEC_PROC_USER || MINISEC_PROC_USERGROUP
+ 	help
+ 	  If you say Y here, additional restrictions will be placed on
+ 	  /proc that keep normal users from viewing device information and
+ 	  slabinfo information that could be useful for exploits.
+ 
+ config MINISEC_LINK
+ 	bool "Linking restrictions"
+ 	default n
+ 	help
+ 	  If you say Y here, /tmp race exploits will be prevented, since users
+ 	  will no longer be able to follow symlinks owned by other users in
+ 	  world-writable +t directories (e.g. /tmp), unless the owner of the
+ 	  symlink is the owner of the directory.  Users will also not be
+ 	  able to hardlink to files they do not own.  If the sysctl option is
+ 	  enabled, a sysctl option with name "linking_restrictions" is created.
+ 
+ config MINISEC_SYMLINKOWN
+ 	bool "Kernel-enforced SymlinksIfOwnerMatch"
+ 	default n && MINISEC_CONFIG_SERVER
+ 	help
+ 	  Apache's SymlinksIfOwnerMatch option has an inherent race condition
+ 	  that prevents it from being used as a security feature.  As Apache
+ 	  verifies the symlink by performing a stat() against the target of
+ 	  the symlink before it is followed, an attacker can setup a symlink
+ 	  to point to a same-owned file, then replace the symlink with one
+ 	  that targets another user's file just after Apache "validates" the
+ 	  symlink -- a classic TOCTOU race.  If you say Y here, a complete,
+ 	  race-free replacement for Apache's "SymlinksIfOwnerMatch" option
+ 	  will be in place for the group you specify. If the sysctl option
+ 	  is enabled, a sysctl option with name "enforce_symlinksifowner" is
+ 	  created.
+ 
+ config MINISEC_SYMLINKOWN_GID
+ 	int "GID for users with kernel-enforced SymlinksIfOwnerMatch"
+ 	depends on MINISEC_SYMLINKOWN
+ 	default 1006
+ 	help
+ 	  Setting this GID determines what group kernel-enforced
+ 	  SymlinksIfOwnerMatch will be enabled for.  If the sysctl option
+ 	  is enabled, a sysctl option with name "symlinkown_gid" is created.
+ 
+ config MINISEC_FIFO
+ 	bool "FIFO restrictions"
+ 	default n
+ 	help
+ 	  If you say Y here, users will not be able to write to FIFOs they don't
+ 	  own in world-writable +t directories (e.g. /tmp), unless the owner of
+ 	  the FIFO is the same owner of the directory it's held in.  If the sysctl
+ 	  option is enabled, a sysctl option with name "fifo_restrictions" is
+ 	  created.
+ 
+ config MINISEC_SYSFS_RESTRICT
+ 	bool "Sysfs/debugfs restriction"
+ 	default n
+ 	depends on SYSFS
+ 	help
+ 	  If you say Y here, sysfs (the pseudo-filesystem mounted at /sys) and
+ 	  any filesystem normally mounted under it (e.g. debugfs) will be
+ 	  mostly accessible only by root.  These filesystems generally provide access
+ 	  to hardware and debug information that isn't appropriate for unprivileged
+ 	  users of the system.  Sysfs and debugfs have also become a large source
+ 	  of new vulnerabilities, ranging from infoleaks to local compromise.
+ 	  There has been very little oversight with an eye toward security involved
+ 	  in adding new exporters of information to these filesystems, so their
+ 	  use is discouraged.
+ 	  For reasons of compatibility, a few directories have been whitelisted
+ 	  for access by non-root users:
+ 	  /sys/fs/selinux
+ 	  /sys/fs/fuse
+ 	  /sys/devices/system/cpu
+ 
+ endmenu
+ 
+ menu "Sysctl Support"
+ depends on MINISEC && SYSCTL
+ 
+ config MINISEC_SYSCTL
+ 	bool "Sysctl support"
+ 	default n
+ 	help
+ 	  If you say Y here, you will be able to change the options that
+ 	  miniSEC runs with at bootup, without having to recompile your
+ 	  kernel.  You can echo values to files in /proc/sys/kernel/grsecurity
+ 	  to enable (1) or disable (0) various features.  All the sysctl entries
+ 	  are mutable until the "grsec_lock" entry is set to a non-zero value.
+ 	  All features enabled in the kernel configuration are disabled at boot
+ 	  if you do not say Y to the "Turn on features by default" option.
+ 	  All options should be set at startup, and the grsec_lock entry should
+ 	  be set to a non-zero value after all the options are set.
+ 	  *THIS IS EXTREMELY IMPORTANT*
+ 
+ endmenu
+ 
+ config MINISEC_HARDEN_IPC
+ 	bool "Disallow access to overly-permissive IPC objects"
+ 	default n
+ 	depends on SYSVIPC
+ 	help
+ 	  If you say Y here, access to overly-permissive IPC objects (shared
+ 	  memory, message queues, and semaphores) will be denied for processes
+ 	  given the following criteria beyond normal permission checks:
+ 	  1) If the IPC object is world-accessible and the euid doesn't match
+ 	     that of the creator or current uid for the IPC object
+ 	  2) If the IPC object is group-accessible and the egid doesn't
+ 	     match that of the creator or current gid for the IPC object
+ 	  It's a common error to grant too much permission to these objects,
+ 	  with impact ranging from denial of service and information leaking to
+ 	  privilege escalation.  This feature was developed in response to
+ 	  research by Tim Brown:
+ 	  http://labs.portcullis.co.uk/whitepapers/memory-squatting-attacks-on-system-v-shared-memory/
+ 	  who found hundreds of such insecure usages.  Processes with
+ 	  CAP_IPC_OWNER are still permitted to access these IPC objects.
+ 	  If the sysctl option is enabled, a sysctl option with name
+ 	  "harden_ipc" is created.
+ 
+ config MINISEC_HARDEN_TTY
+ 	bool "Disallow unprivileged use of command injection"
+ 	default n
+ 	depends on !SECURITY_TIOCSTI_RESTRICT
+ 	help
+ 	  If you say Y here, the ability to use the TIOCSTI ioctl for
+ 	  terminal command injection will be denied for unprivileged users.
+ 	  There are very few legitimate uses for this functionality and it
+ 	  has made vulnerabilities in several 'su'-like programs possible in
+ 	  the past.  Even without these vulnerabilities, it provides an
+ 	  attacker with an easy mechanism to move laterally among other
+ 	  processes within the same user's compromised session.
+ 	  By default, Linux allows unprivileged use of command injection as
+ 	  long as the injection is being performed into the same tty session.
+ 	  This feature makes that case the same as attempting to inject into
+ 	  another session, making any TIOCSTI use require CAP_SYS_ADMIN.
+ 	  If the sysctl option is enabled, a sysctl option with name
+ 	  "harden_tty" is created.
+ 
+ endmenu
+ 
  config SECURITY_DMESG_RESTRICT
  	bool "Restrict unprivileged access to the kernel syslog"
  	default y
***************
*** 353,356 ****
  source "security/Kconfig.hardening"
  
  endmenu
- 
--- 1007,1009 ----
diff --color -rcNP Master/security/Makefile OG/security/Makefile
*** Master/security/Makefile	2021-04-20 14:17:32.000000000 -0400
--- OG/security/Makefile	2021-04-20 15:11:34.523000000 -0400
***************
*** 30,35 ****
--- 30,36 ----
  obj-$(CONFIG_SECURITY_SAFESETID)       += safesetid/
  obj-$(CONFIG_SECURITY_LOCKDOWN_LSM)	+= lockdown/
  obj-$(CONFIG_CGROUP_DEVICE)		+= device_cgroup.o
+ obj-$(CONFIG_MINISEC)                   += minisec/
  
  # Object integrity file lists
  subdir-$(CONFIG_INTEGRITY)		+= integrity
diff --color -rcNP Master/security/minisec/Makefile OG/security/minisec/Makefile
*** Master/security/minisec/Makefile	1969-12-31 19:00:00.000000000 -0500
--- OG/security/minisec/Makefile	2021-04-20 15:11:34.526000000 -0400
***************
*** 0 ****
--- 1,4 ----
+ # SPDX-License-Identifier: GPL-2.0
+ # Makefile for miniSEC
+ #
+ obj-$(CONFIG_MINISEC) += chroot.o init.o ipc.o proc.o sysctl.o tty.o
diff --color -rcNP Master/security/minisec/chroot.c OG/security/minisec/chroot.c
*** Master/security/minisec/chroot.c	1969-12-31 19:00:00.000000000 -0500
--- OG/security/minisec/chroot.c	2021-04-20 15:11:34.526000000 -0400
***************
*** 0 ****
--- 1,485 ----
+ #include <linux/kernel.h>
+ #include <linux/module.h>
+ #include <linux/sched.h>
+ #include <linux/file.h>
+ #include <linux/fs.h>
+ #include <linux/mount.h>
+ #include <linux/types.h>
+ #include <linux/namei.h>
+ #include <linux/fs_struct.h>
+ #include <linux/nsproxy.h>
+ #include <linux/minisec.h>
+ #include "../fs/mount.h"
+ 
+ #ifdef CONFIG_MINISEC_CHROOT_INITRD
+ int gr_init_ran;	/* 0 until gr_set_chroot_entries() first runs for pid 1; while 0 no task is marked chrooted */
+ #endif
+ 
+ void gr_inc_chroot_refcnts(struct dentry *dentry, struct vfsmount *mnt)
+ {
+ #ifdef CONFIG_MINISEC_CHROOT_RENAME
+ 	struct dentry *tmpd = dentry;
+ 
+ 	read_seqlock_excl(&mount_lock);
+ 	write_seqlock(&rename_lock);
+ 	/* Bump chroot_refcnt on dentry and on every ancestor below mnt->mnt_root. */
+ 	while (tmpd != mnt->mnt_root) {
+ 		atomic_inc(&tmpd->chroot_refcnt);
+ 		tmpd = tmpd->d_parent;
+ 	}
+ 	atomic_inc(&tmpd->chroot_refcnt);	/* the loop stopped at mnt->mnt_root; count it as well */
+ 
+ 	write_sequnlock(&rename_lock);
+ 	read_sequnlock_excl(&mount_lock);
+ #endif	/* without CONFIG_MINISEC_CHROOT_RENAME this function is a no-op */
+ }
+ 
+ void gr_dec_chroot_refcnts(struct dentry *dentry, struct vfsmount *mnt)
+ {
+ #ifdef CONFIG_MINISEC_CHROOT_RENAME
+ 	struct dentry *tmpd = dentry;
+ 
+ 	read_seqlock_excl(&mount_lock);
+ 	write_seqlock(&rename_lock);
+ 	/* Drop chroot_refcnt on dentry and on every ancestor below mnt->mnt_root. */
+ 	while (tmpd != mnt->mnt_root) {
+ 		atomic_dec(&tmpd->chroot_refcnt);
+ 		tmpd = tmpd->d_parent;
+ 	}
+ 	atomic_dec(&tmpd->chroot_refcnt);	/* the loop stopped at mnt->mnt_root; drop it as well */
+ 
+ 	write_sequnlock(&rename_lock);
+ 	read_sequnlock_excl(&mount_lock);
+ #endif	/* without CONFIG_MINISEC_CHROOT_RENAME this function is a no-op */
+ }
+ 
+ #ifdef CONFIG_MINISEC_CHROOT_RENAME
+ static struct dentry *get_closest_chroot(struct dentry *dentry)
+ {
+ 	write_seqlock(&rename_lock);	/* held for the whole upward walk */
+ 	do {
+ 		if (atomic_read(&dentry->chroot_refcnt)) {	/* first dentry with a non-zero chroot_refcnt wins */
+ 			write_sequnlock(&rename_lock);
+ 			return dentry;
+ 		}
+ 		dentry = dentry->d_parent;
+ 	} while (!IS_ROOT(dentry));	/* ends once the parent is IS_ROOT(); that dentry is not itself tested */
+ 	write_sequnlock(&rename_lock);
+ 	return NULL;	/* no dentry on the walked path had a non-zero chroot_refcnt */
+ }
+ #endif
+ 
+ /* Return 1 if olddentry lies under a dentry with a chroot_refcnt and newdentry is not a subdir of it, else 0. */
+ int gr_bad_chroot_rename(struct dentry *olddentry, struct vfsmount *oldmnt,
+ 			 struct dentry *newdentry, struct vfsmount *newmnt)
+ {
+ #ifdef CONFIG_MINISEC_CHROOT_RENAME
+ 	struct dentry *jail;
+ 
+ 	/* Runtime toggle is off. */
+ 	if (unlikely(!grsec_enable_chroot_rename))
+ 		return 0;
+ 
+ 	/* A non-chrooted caller whose uid passes gr_is_global_root() is exempt. */
+ 	if (likely(!proc_is_chrooted(current) && gr_is_global_root(current_uid())))
+ 		return 0;
+ 
+ 	/* Only a rename that leaves the closest referenced dentry is bad. */
+ 	jail = get_closest_chroot(olddentry);
+ 	if (jail && !is_subdir(newdentry, jail))
+ 		return 1;
+ 
+ 	return 0;
+ #else
+ 	return 0;
+ #endif
+ }
+ 
+ void gr_set_chroot_entries(struct task_struct *task, const struct path *path)
+ {
+ #ifdef CONFIG_MINISEC
+ 	if (task_pid_nr(task) > 1 && path->dentry != init_task.fs->root.dentry &&
+ 	    		     path->dentry != task->nsproxy->mnt_ns->root->mnt.mnt_root
+ #ifdef CONFIG_MINISEC_CHROOT_INITRD
+ 			     && gr_init_ran
+ #endif
+ 	   )
+ 		task->gr_is_chrooted = 1;	/* pid > 1 and path is neither init_task's root nor the task's mount-namespace root */
+ 	else {
+ #ifdef CONFIG_MINISEC_CHROOT_INITRD
+ 		if (task_pid_nr(task) == 1 && !gr_init_ran)
+ 			gr_init_ran = 1;	/* from now on the gr_init_ran term above no longer blocks marking */
+ #endif
+ 		task->gr_is_chrooted = 0;
+ 	}
+ 	/* The dentry is recorded in both cases, whether or not the task was flagged. */
+ 	task->gr_chroot_dentry = path->dentry;
+ #endif
+ 	return;
+ }
+ 
+ /* Reset task's chroot bookkeeping: no recorded root dentry, not flagged as chrooted. */
+ void gr_clear_chroot_entries(struct task_struct *task)
+ {
+ #ifdef CONFIG_MINISEC
+ 	task->gr_chroot_dentry = NULL;
+ 	task->gr_is_chrooted = 0;
+ #endif
+ }
+ 
+ /* Return 0 if the caller is chrooted and task `pid` exists with a different root, else 1. */
+ int gr_handle_chroot_unix(const pid_t pid)
+ {
+ #ifdef CONFIG_MINISEC_CHROOT_UNIX
+ 	struct task_struct *peer;
+ 	int allowed;
+ 
+ 	if (unlikely(!grsec_enable_chroot_unix))
+ 		return 1;
+ 	if (likely(!proc_is_chrooted(current)))
+ 		return 1;
+ 
+ 	rcu_read_lock();
+ 	read_lock(&tasklist_lock);
+ 	peer = find_task_by_vpid_unrestricted(pid);
+ 	/* A pid with no task is allowed; an existing task must share the caller's root. */
+ 	allowed = !peer || have_same_root(current, peer);
+ 	read_unlock(&tasklist_lock);
+ 	rcu_read_unlock();
+ 
+ 	return allowed;
+ #else
+ 	return 1;
+ #endif
+ }
+ 
+ /* Return -EPERM to a chrooted caller while grsec_enable_chroot_nice is set, else 0. */
+ int gr_handle_chroot_nice(void)
+ {
+ #ifdef CONFIG_MINISEC_CHROOT_NICE
+ 	/* Both the runtime toggle and the chroot state must hold to deny. */
+ 	if (grsec_enable_chroot_nice && proc_is_chrooted(current))
+ 		return -EPERM;
+ #endif
+ 	return 0;
+ }
+ 
+ /* Return -EACCES when, with the toggle set, a chrooted caller asks for a niceval below task_nice(p); else 0. */
+ int gr_handle_chroot_setpriority(struct task_struct *p, const int niceval)
+ {
+ #ifdef CONFIG_MINISEC_CHROOT_NICE
+ 	if (!grsec_enable_chroot_nice || niceval >= task_nice(p))
+ 		return 0;
+ 	if (proc_is_chrooted(current))
+ 		return -EACCES;
+ #endif
+ 	return 0;
+ }
+ 
+ int
+ gr_handle_chroot_fowner(struct pid *pid, enum pid_type type)
+ {
+ #ifdef CONFIG_MINISEC_CHROOT_FINDTASK
+ 	struct task_struct *p;
+ 	int ret = 0;	/* becomes 1 if any task attached to pid has a different root than the caller */
+ 	if (!grsec_enable_chroot_findtask || !proc_is_chrooted(current) || !pid)
+ 		return ret;	/* toggle off, caller not chrooted, or no pid: nothing to check */
+ 
+ 	read_lock(&tasklist_lock);
+ 	do_each_pid_task(pid, type, p) {
+ 		if (!have_same_root(current, p)) {
+ 			ret = 1;
+ 			goto out;	/* one mismatch decides the result */
+ 		}
+ 	} while_each_pid_task(pid, type, p);
+ out:
+ 	read_unlock(&tasklist_lock);
+ 	return ret;
+ #endif
+ 	return 0;	/* reached only when CONFIG_MINISEC_CHROOT_FINDTASK is not set */
+ }
+ 
+ /* Return 1 when the caller is chrooted and p is zombie/dead or does not share its root; 0 otherwise. */
+ int gr_pid_is_chrooted(struct task_struct *p)
+ {
+ #ifdef CONFIG_MINISEC_CHROOT_FINDTASK
+ 	if (!grsec_enable_chroot_findtask || !proc_is_chrooted(current) || !p)
+ 		return 0;
+ 
+ 	if (p->exit_state & (EXIT_ZOMBIE | EXIT_DEAD))
+ 		return 1;
+ 	if (!have_same_root(current, p))
+ 		return 1;
+ #endif
+ 	return 0;
+ }
+ 
+ EXPORT_SYMBOL_GPL(gr_pid_is_chrooted);
+ 
+ #if defined(CONFIG_MINISEC_CHROOT_DOUBLE) || defined(CONFIG_MINISEC_CHROOT_FCHDIR)
+ /* NOTE: despite the name, returns 1 when the path IS under the caller's current root, else 0. */
+ int gr_is_outside_chroot(const struct dentry *u_dentry, const struct vfsmount *u_mnt)
+ {
+ 	struct path target, root;
+ 	int under;
+ 
+ 	target.mnt = (struct vfsmount *)u_mnt;
+ 	target.dentry = (struct dentry *)u_dentry;
+ 	get_fs_root(current->fs, &root);
+ 	under = path_is_under(&target, &root) ? 1 : 0;
+ 	path_put(&root);
+ 
+ 	return under;
+ }
+ #endif
+ 
+ /* Return 0 to refuse a chrooted caller a path that is not under its root; 1 otherwise. */
+ int gr_chroot_fchdir(struct dentry *u_dentry, struct vfsmount *u_mnt)
+ {
+ #ifdef CONFIG_MINISEC_CHROOT_FCHDIR
+ 	if (!grsec_enable_chroot_fchdir)
+ 		return 1;
+ 	if (!proc_is_chrooted(current))
+ 		return 1;
+ 
+ 	/* gr_is_outside_chroot() yields non-zero for paths under the current root. */
+ 	if (!gr_is_outside_chroot(u_dentry, u_mnt))
+ 		return 0;
+ #endif
+ 	return 1;
+ }
+ 
+ int
+ gr_chroot_pathat(int dfd, struct dentry *u_dentry, struct vfsmount *u_mnt, unsigned flags)
+ {
+ #ifdef CONFIG_MINISEC_CHROOT_FCHDIR
+ 	struct fd f;
+ 	struct path fd_path;
+ 	struct path file_path;
+ 
+ 	if (!grsec_enable_chroot_fchdir)
+ 		return 0;
+ 
+ 	if (!proc_is_chrooted(current) || dfd == -1 || dfd == AT_FDCWD)
+ 		return 0;
+ 	/* NOTE(review): -ECHILD under LOOKUP_RCU presumably asks the path walk to retry without RCU; confirm in fs/namei.c. */
+ 	if (flags & LOOKUP_RCU)
+ 		return -ECHILD;
+ 
+ 	f = fdget_raw(dfd);
+ 	if (!f.file)
+ 		return 0;	/* dfd does not name an open file: nothing to compare against */
+ 	/* Copy the directory's path and take our own reference before releasing the fd. */
+ 	fd_path = f.file->f_path;
+ 	path_get(&fd_path);
+ 	fdput(f);
+ 
+ 	file_path.dentry = u_dentry;
+ 	file_path.mnt = u_mnt;
+ 	/* Deny only when the target is neither under the caller's root nor under the dfd directory. */
+ 	if (!gr_is_outside_chroot(u_dentry, u_mnt) && !path_is_under(&file_path, &fd_path)) {
+ 		path_put(&fd_path);
+ 		return -ENOENT;
+ 	}
+ 	path_put(&fd_path);
+ #endif
+ 	return 0;
+ }
+ 
+ /*
+  * Return 1 to allow the operation and 0 to refuse it.
+  *
+  * While grsec_enable_chroot_fchdir is set, every chrooted caller is
+  * refused.  Without CONFIG_MINISEC_CHROOT_FCHDIR the answer is
+  * always 1.
+  */
+ int gr_chroot_fhandle(void)
+ {
+ #ifdef CONFIG_MINISEC_CHROOT_FCHDIR
+ 	if (grsec_enable_chroot_fchdir && proc_is_chrooted(current))
+ 		return 0;
+ #endif
+ 	return 1;
+ }
+ 
+ int
+ gr_chroot_shmat(const pid_t shm_cprid, const pid_t shm_lapid,
+ 		const u64 shm_createtime)
+ {
+ #ifdef CONFIG_MINISEC_CHROOT_SHMAT
+ 	struct task_struct *p;
+ 
+ 	if (unlikely(!grsec_enable_chroot_shmat))
+ 		return 1;
+ 
+ 	if (likely(!proc_is_chrooted(current)))
+ 		return 1;
+ 
+ 	rcu_read_lock();
+ 	read_lock(&tasklist_lock);
+ 	/* Result: 1 = allow, 0 = deny.  First look at the task currently using pid shm_cprid. */
+ 	if ((p = find_task_by_vpid_unrestricted(shm_cprid))) {
+ 		if (time_before_eq64(p->start_time, shm_createtime)) {	/* task started no later than shm_createtime */
+ 			if (have_same_root(current, p)) {
+ 				goto allow;
+ 			} else {
+ 				read_unlock(&tasklist_lock);
+ 				rcu_read_unlock();
+ 				return 0;
+ 			}
+ 		}
+ 		/* creator exited, pid reuse, fall through to next check */
+ 	}
+ 	if ((p = find_task_by_vpid_unrestricted(shm_lapid))) {	/* otherwise judge by the task using pid shm_lapid */
+ 		if (unlikely(!have_same_root(current, p))) {
+ 			read_unlock(&tasklist_lock);
+ 			rcu_read_unlock();
+ 			return 0;
+ 		}
+ 	}
+ 
+ allow:	/* also reached when neither pid maps to a task */
+ 	read_unlock(&tasklist_lock);
+ 	rcu_read_unlock();
+ #endif
+ 	return 1;
+ }
+ 
+ /* No-op: nothing is logged here; both arguments are ignored. */
+ void
+ gr_log_chroot_exec(const struct dentry *dentry, const struct vfsmount *mnt)
+ {
+ }
+ 
+ /* -EPERM if a chrooted task makes a node that is neither FIFO nor regular file. */
+ int gr_handle_chroot_mknod(const struct dentry *dentry,
+ 			   const struct vfsmount *mnt, const int mode)
+ {
+ #ifdef CONFIG_MINISEC_CHROOT_MKNOD
+ 	if (!grsec_enable_chroot_mknod || S_ISFIFO(mode) || S_ISREG(mode))
+ 		return 0;
+ 	if (proc_is_chrooted(current))
+ 		return -EPERM;
+ #endif
+ 	return 0;
+ }
+ 
+ /* -EPERM when grsec_enable_chroot_mount is set and current is chrooted; else 0. */
+ int gr_handle_chroot_mount(const struct dentry *dentry,
+ 			   const struct vfsmount *mnt, const char *dev_name)
+ {
+ #ifdef CONFIG_MINISEC_CHROOT_MOUNT
+ 	if (!grsec_enable_chroot_mount)
+ 		return 0;
+ 	return proc_is_chrooted(current) ? -EPERM : 0;
+ #endif
+ 	return 0;
+ }
+ 
+ /* -EPERM when grsec_enable_chroot_pivot is set and current is chrooted; else 0. */
+ int gr_handle_chroot_pivot(void)
+ {
+ #ifdef CONFIG_MINISEC_CHROOT_PIVOT
+ 	if (!grsec_enable_chroot_pivot)
+ 		return 0;
+ 	return proc_is_chrooted(current) ? -EPERM : 0;
+ #endif
+ 	return 0;
+ }
+ 
+ /* -EPERM if a chrooted task's target gets 0 from gr_is_outside_chroot(); else 0. */
+ int gr_handle_chroot_chroot(const struct dentry *dentry, const struct vfsmount *mnt)
+ {
+ #ifdef CONFIG_MINISEC_CHROOT_DOUBLE
+ 	if (!grsec_enable_chroot_double || !proc_is_chrooted(current))
+ 		return 0;
+ 	if (!gr_is_outside_chroot(dentry, mnt))
+ 		return -EPERM;
+ #endif
+ 	return 0;
+ }
+ 
+ extern const char *captab_log[];
+ extern int captab_log_entries;
+ 
+ /* 0 if caps are enforced, @task is chrooted and @cap is in GR_CHROOT_CAPS; else 1. */
+ int
+ gr_task_chroot_is_capable(const struct task_struct *task, const struct cred *cred, const int cap)
+ {
+ #ifdef CONFIG_MINISEC_CHROOT_CAPS
+ 	kernel_cap_t denied = GR_CHROOT_CAPS;
+ 
+ 	if (!grsec_enable_chroot_caps || !proc_is_chrooted(task))
+ 		return 1;
+ 	return cap_raised(denied, cap) ? 0 : 1;
+ #endif
+ 	return 1;
+ }
+ 
+ /* gr_task_chroot_is_capable() for current; always 1 without CHROOT_CAPS. */
+ int gr_chroot_is_capable(const int cap)
+ {
+ #ifndef CONFIG_MINISEC_CHROOT_CAPS
+ 	return 1;
+ #endif
+ 	return gr_task_chroot_is_capable(current, current_cred(), cap);
+ }
+ 
+ /* 0 if caps are enforced, @task is chrooted and @cap is in GR_CHROOT_CAPS; else 1. */
+ int
+ gr_task_chroot_is_capable_nolog(const struct task_struct *task, const int cap)
+ {
+ #ifdef CONFIG_MINISEC_CHROOT_CAPS
+ 	kernel_cap_t restricted = GR_CHROOT_CAPS;
+ 
+ 	if (grsec_enable_chroot_caps && proc_is_chrooted(task) &&
+ 	    cap_raised(restricted, cap))
+ 		return 0;
+ #endif
+ 	return 1;
+ }
+ 
+ /* gr_task_chroot_is_capable_nolog() for current; always 1 without CHROOT_CAPS. */
+ int gr_chroot_is_capable_nolog(const int cap)
+ {
+ #ifndef CONFIG_MINISEC_CHROOT_CAPS
+ 	return 1;
+ #endif
+ 	return gr_task_chroot_is_capable_nolog(current, cap);
+ }
+ 
+ /* -EACCES if enabled, @op has MAY_WRITE and current is chrooted; else 0. */
+ int gr_handle_chroot_sysctl(const int op)
+ {
+ #ifdef CONFIG_MINISEC_CHROOT_SYSCTL
+ 	if (!grsec_enable_chroot_sysctl || !(op & MAY_WRITE))
+ 		return 0;
+ 	return proc_is_chrooted(current) ? -EACCES : 0;
+ #endif
+ 	return 0;
+ }
+ 
+ /* Calls set_fs_pwd(current->fs, path) when grsec_enable_chroot_chdir is set. */
+ void gr_handle_chroot_chdir(const struct path *path)
+ {
+ #ifdef CONFIG_MINISEC_CHROOT_CHDIR
+ 	if (!grsec_enable_chroot_chdir)
+ 		return;
+ 	set_fs_pwd(current->fs, path);
+ #endif
+ }
+ 
+ /* -EPERM if a chrooted task sets setuid or setgid+group-exec on a non-directory. */
+ int gr_handle_chroot_chmod(const struct dentry *dentry,
+ 			   const struct vfsmount *mnt, const int mode)
+ {
+ #ifdef CONFIG_MINISEC_CHROOT_CHMOD
+ 	const int sgid_exec = S_ISGID | S_IXGRP;
+ 	const int adds_s = (mode & S_ISUID) || (mode & sgid_exec) == sgid_exec;
+ 
+ 	if (grsec_enable_chroot_chmod && !d_is_dir(dentry) && adds_s &&
+ 	    proc_is_chrooted(current))
+ 		return -EPERM;
+ #endif
+ 	return 0;
+ }
diff --color -rcNP Master/security/minisec/init.c OG/security/minisec/init.c
*** Master/security/minisec/init.c	1969-12-31 19:00:00.000000000 -0500
--- OG/security/minisec/init.c	2021-04-20 15:11:34.526000000 -0400
***************
*** 0 ****
--- 1,81 ----
+ #include <linux/kernel.h>
+ 
+ /* Feature switches, zero-initialised; minisec_init() sets the built-in ones to 1. */
+ int grsec_enable_chroot_findtask;
+ int grsec_enable_chroot_shmat;
+ int grsec_enable_chroot_mount;
+ int grsec_enable_chroot_double;
+ int grsec_enable_chroot_pivot;
+ int grsec_enable_chroot_chdir;
+ int grsec_enable_chroot_chmod;
+ int grsec_enable_chroot_mknod;
+ int grsec_enable_chroot_fchdir;
+ int grsec_enable_chroot_nice;
+ int grsec_enable_chroot_execlog;
+ int grsec_enable_chroot_caps;
+ int grsec_enable_chroot_rename;
+ int grsec_enable_chroot_sysctl;
+ int grsec_enable_chroot_unix;
+ int grsec_enable_harden_ipc;
+ int grsec_enable_harden_tty;
+ int grsec_lock;
+ 
+ /*
+  * Turn on every feature whose CONFIG_MINISEC_* option is built in.
+  * (No audit_ptrace flag is defined in this file, so none is set here.)
+  */
+ void __init
+ minisec_init(void)
+ {
+ #ifdef CONFIG_MINISEC_HARDEN_IPC
+ 	grsec_enable_harden_ipc = 1;
+ #endif
+ #ifdef CONFIG_MINISEC_HARDEN_TTY
+ 	grsec_enable_harden_tty = 1;
+ #endif
+ #ifdef CONFIG_MINISEC_CHROOT_FINDTASK
+ 	grsec_enable_chroot_findtask = 1;
+ #endif
+ #ifdef CONFIG_MINISEC_CHROOT_UNIX
+ 	grsec_enable_chroot_unix = 1;
+ #endif
+ #ifdef CONFIG_MINISEC_CHROOT_MOUNT
+ 	grsec_enable_chroot_mount = 1;
+ #endif
+ #ifdef CONFIG_MINISEC_CHROOT_FCHDIR
+ 	grsec_enable_chroot_fchdir = 1;
+ #endif
+ #ifdef CONFIG_MINISEC_CHROOT_SHMAT
+ 	grsec_enable_chroot_shmat = 1;
+ #endif
+ #ifdef CONFIG_MINISEC_CHROOT_DOUBLE
+ 	grsec_enable_chroot_double = 1;
+ #endif
+ #ifdef CONFIG_MINISEC_CHROOT_PIVOT
+ 	grsec_enable_chroot_pivot = 1;
+ #endif
+ #ifdef CONFIG_MINISEC_CHROOT_CHDIR
+ 	grsec_enable_chroot_chdir = 1;
+ #endif
+ #ifdef CONFIG_MINISEC_CHROOT_CHMOD
+ 	grsec_enable_chroot_chmod = 1;
+ #endif
+ #ifdef CONFIG_MINISEC_CHROOT_MKNOD
+ 	grsec_enable_chroot_mknod = 1;
+ #endif
+ #ifdef CONFIG_MINISEC_CHROOT_NICE
+ 	grsec_enable_chroot_nice = 1;
+ #endif
+ #ifdef CONFIG_MINISEC_CHROOT_EXECLOG
+ 	grsec_enable_chroot_execlog = 1;
+ #endif
+ #ifdef CONFIG_MINISEC_CHROOT_CAPS
+ 	grsec_enable_chroot_caps = 1;
+ #endif
+ #ifdef CONFIG_MINISEC_CHROOT_RENAME
+ 	grsec_enable_chroot_rename = 1;
+ #endif
+ #ifdef CONFIG_MINISEC_CHROOT_SYSCTL
+ 	grsec_enable_chroot_sysctl = 1;
+ #endif
+ }
diff --color -rcNP Master/security/minisec/ipc.c OG/security/minisec/ipc.c
*** Master/security/minisec/ipc.c	1969-12-31 19:00:00.000000000 -0500
--- OG/security/minisec/ipc.c	2021-04-20 15:11:34.526000000 -0400
***************
*** 0 ****
--- 1,46 ----
+ #include <linux/kernel.h>
+ #include <linux/mm.h>
+ #include <linux/sched.h>
+ #include <linux/file.h>
+ #include <linux/ipc.h>
+ #include <linux/ipc_namespace.h>
+ #include <linux/minisec.h>
+ 
+ /*
+  * Returns 0 when requested_mode fits granted_mode but not the narrower mode
+  * recomputed below from ipcp->mode, and current lacks CAP_IPC_OWNER in
+  * ns->user_ns.  Returns 1 otherwise (also when harden_ipc is off).
+  */
+ int
+ gr_ipc_permitted(struct ipc_namespace *ns, struct kern_ipc_perm *ipcp, int requested_mode, int granted_mode)
+ {
+ #ifdef CONFIG_MINISEC_HARDEN_IPC
+ 	int orig_granted_mode;
+ 	kuid_t euid;
+ 	kgid_t egid;
+ 
+ 	if (!grsec_enable_harden_ipc)
+ 		return 1;
+ 
+ 	euid = current_euid();
+ 	egid = current_egid();
+ 	orig_granted_mode = ipcp->mode;
+ 	if (uid_eq(euid, ipcp->cuid) || uid_eq(euid, ipcp->uid))
+ 		orig_granted_mode >>= 6;
+ 	else {
+ 		/* if likely wrong permissions, lock to user */
+ 		if (orig_granted_mode & 0007)
+ 			orig_granted_mode = 0;
+ 		/* otherwise do a egid-only check */
+ 		else if (gid_eq(egid, ipcp->cgid) || gid_eq(egid, ipcp->gid))
+ 			orig_granted_mode >>= 3;
+ 		/* otherwise, no access */
+ 		else
+ 			orig_granted_mode = 0;
+ 	}
+ 	if (!(requested_mode & ~granted_mode & 0007) && (requested_mode & ~orig_granted_mode & 0007) &&
+ 	    !ns_capable_noaudit(ns->user_ns, CAP_IPC_OWNER))
+ 		return 0;
+ #endif
+ 	return 1;
+ }
diff --color -rcNP Master/security/minisec/proc.c OG/security/minisec/proc.c
*** Master/security/minisec/proc.c	1969-12-31 19:00:00.000000000 -0500
--- OG/security/minisec/proc.c	2021-04-20 15:11:34.526000000 -0400
***************
*** 0 ****
--- 1,19 ----
+ #include <linux/kernel.h>
+ #include <linux/sched.h>
+ #include <linux/minisec.h>
+ 
+ /*
+  * Returns -EACCES unless current's fsuid is root or, under PROC_USERGROUP,
+  * current is a member of grsec_proc_gid.  0 if neither option is built in.
+  */
+ int gr_proc_is_restricted(void)
+ {
+ #if defined(CONFIG_MINISEC_PROC_USER)
+ 	if (!uid_eq(current_cred()->fsuid, GLOBAL_ROOT_UID))
+ 		return -EACCES;
+ #elif defined(CONFIG_MINISEC_PROC_USERGROUP)
+ 	if (!uid_eq(current_cred()->fsuid, GLOBAL_ROOT_UID) && !in_group_p(grsec_proc_gid))
+ 		return -EACCES;
+ #endif
+ 	return 0;
+ }
diff --color -rcNP Master/security/minisec/sysctl.c OG/security/minisec/sysctl.c
*** Master/security/minisec/sysctl.c	1969-12-31 19:00:00.000000000 -0500
--- OG/security/minisec/sysctl.c	2021-04-20 15:11:34.526000000 -0400
***************
*** 0 ****
--- 1,183 ----
+ #include <linux/kernel.h>
+ #include <linux/sched.h>
+ #include <linux/sysctl.h>
+ #include <linux/minisec.h>
+ 
+ /* -EACCES for a write under the "grsecurity" sysctl dir while grsec_lock is set. */
+ int
+ gr_handle_sysctl_mod(const char *dirname, const char *name, const int op)
+ {
+ #ifdef CONFIG_MINISEC_SYSCTL
+ 	if (!dirname || !name || strcmp(dirname, "grsecurity") != 0)
+ 		return 0;
+ 	if (grsec_lock && (op & MAY_WRITE))
+ 		return -EACCES;
+ #endif
+ 	return 0;
+ }
+ 
+ #ifdef CONFIG_MINISEC_SYSCTL
+ struct ctl_table grsecurity_table[] = {	/* one 0600 int entry per built-in feature flag */
+ #ifdef CONFIG_MINISEC_HARDEN_IPC
+ 	{
+ 		.procname	= "harden_ipc",
+ 		.data		= &grsec_enable_harden_ipc,
+ 		.maxlen		= sizeof(int),
+ 		.mode		= 0600,
+ 		.proc_handler	= &proc_dointvec_secure,
+ 	},
+ #endif
+ #ifdef CONFIG_MINISEC_HARDEN_TTY
+ 	{
+ 		.procname	= "harden_tty",
+ 		.data		= &grsec_enable_harden_tty,
+ 		.maxlen		= sizeof(int),
+ 		.mode		= 0600,
+ 		.proc_handler	= &proc_dointvec_secure,
+ 	},
+ #endif
+ #ifdef CONFIG_MINISEC_CHROOT_SHMAT
+ 	{
+ 		.procname	= "chroot_deny_shmat",
+ 		.data		= &grsec_enable_chroot_shmat,
+ 		.maxlen		= sizeof(int),
+ 		.mode		= 0600,
+ 		.proc_handler	= &proc_dointvec_secure,
+ 	},
+ #endif
+ #ifdef CONFIG_MINISEC_CHROOT_UNIX
+ 	{
+ 		.procname	= "chroot_deny_unix",
+ 		.data		= &grsec_enable_chroot_unix,
+ 		.maxlen		= sizeof(int),
+ 		.mode		= 0600,
+ 		.proc_handler	= &proc_dointvec_secure,
+ 	},
+ #endif
+ #ifdef CONFIG_MINISEC_CHROOT_MOUNT
+ 	{
+ 		.procname	= "chroot_deny_mount",
+ 		.data		= &grsec_enable_chroot_mount,
+ 		.maxlen		= sizeof(int),
+ 		.mode		= 0600,
+ 		.proc_handler	= &proc_dointvec_secure,
+ 	},
+ #endif
+ #ifdef CONFIG_MINISEC_CHROOT_FCHDIR
+ 	{
+ 		.procname	= "chroot_deny_fchdir",
+ 		.data		= &grsec_enable_chroot_fchdir,
+ 		.maxlen		= sizeof(int),
+ 		.mode		= 0600,
+ 		.proc_handler	= &proc_dointvec_secure,
+ 	},
+ #endif
+ #ifdef CONFIG_MINISEC_CHROOT_DOUBLE
+ 	{
+ 		.procname	= "chroot_deny_chroot",
+ 		.data		= &grsec_enable_chroot_double,
+ 		.maxlen		= sizeof(int),
+ 		.mode		= 0600,
+ 		.proc_handler	= &proc_dointvec_secure,
+ 	},
+ #endif
+ #ifdef CONFIG_MINISEC_CHROOT_PIVOT
+ 	{
+ 		.procname	= "chroot_deny_pivot",
+ 		.data		= &grsec_enable_chroot_pivot,
+ 		.maxlen		= sizeof(int),
+ 		.mode		= 0600,
+ 		.proc_handler	= &proc_dointvec_secure,
+ 	},
+ #endif
+ #ifdef CONFIG_MINISEC_CHROOT_CHDIR
+ 	{
+ 		.procname	= "chroot_enforce_chdir",
+ 		.data		= &grsec_enable_chroot_chdir,
+ 		.maxlen		= sizeof(int),
+ 		.mode		= 0600,
+ 		.proc_handler	= &proc_dointvec_secure,
+ 	},
+ #endif
+ #ifdef CONFIG_MINISEC_CHROOT_CHMOD
+ 	{
+ 		.procname	= "chroot_deny_chmod",
+ 		.data		= &grsec_enable_chroot_chmod,
+ 		.maxlen		= sizeof(int),
+ 		.mode		= 0600,
+ 		.proc_handler	= &proc_dointvec_secure,
+ 	},
+ #endif
+ #ifdef CONFIG_MINISEC_CHROOT_MKNOD
+ 	{
+ 		.procname	= "chroot_deny_mknod",
+ 		.data		= &grsec_enable_chroot_mknod,
+ 		.maxlen		= sizeof(int),
+ 		.mode		= 0600,
+ 		.proc_handler	= &proc_dointvec_secure,
+ 	},
+ #endif
+ #ifdef CONFIG_MINISEC_CHROOT_NICE
+ 	{
+ 		.procname	= "chroot_restrict_nice",
+ 		.data		= &grsec_enable_chroot_nice,
+ 		.maxlen		= sizeof(int),
+ 		.mode		= 0600,
+ 		.proc_handler	= &proc_dointvec_secure,
+ 	},
+ #endif
+ #ifdef CONFIG_MINISEC_CHROOT_EXECLOG
+ 	{
+ 		.procname	= "chroot_execlog",
+ 		.data		= &grsec_enable_chroot_execlog,
+ 		.maxlen		= sizeof(int),
+ 		.mode		= 0600,
+ 		.proc_handler	= &proc_dointvec_secure,
+ 	},
+ #endif
+ #ifdef CONFIG_MINISEC_CHROOT_CAPS
+ 	{
+ 		.procname	= "chroot_caps",
+ 		.data		= &grsec_enable_chroot_caps,
+ 		.maxlen		= sizeof(int),
+ 		.mode		= 0600,
+ 		.proc_handler	= &proc_dointvec_secure,
+ 	},
+ #endif
+ #ifdef CONFIG_MINISEC_CHROOT_RENAME
+ 	{
+ 		.procname	= "chroot_deny_bad_rename",
+ 		.data		= &grsec_enable_chroot_rename,
+ 		.maxlen		= sizeof(int),
+ 		.mode		= 0600,
+ 		.proc_handler	= &proc_dointvec_secure,
+ 	},
+ #endif
+ #ifdef CONFIG_MINISEC_CHROOT_SYSCTL
+ 	{
+ 		.procname	= "chroot_deny_sysctl",
+ 		.data		= &grsec_enable_chroot_sysctl,
+ 		.maxlen		= sizeof(int),
+ 		.mode		= 0600,
+ 		.proc_handler	= &proc_dointvec_secure,
+ 	},
+ #endif
+ #ifdef CONFIG_MINISEC_CHROOT_FINDTASK
+ 	{
+ 		.procname	= "chroot_findtask",
+ 		.data		= &grsec_enable_chroot_findtask,
+ 		.maxlen		= sizeof(int),
+ 		.mode		= 0600,
+ 		.proc_handler	= &proc_dointvec_secure,
+ 	},
+ #endif
+ 	{	/* the only entry without its own #ifdef; gr_handle_sysctl_mod() reads grsec_lock */
+ 		.procname	= "grsec_lock",
+ 		.data		= &grsec_lock,
+ 		.maxlen		= sizeof(int),
+ 		.mode		= 0600,
+ 		.proc_handler	= &proc_dointvec_secure,
+ 	},
+ 	{ }	/* empty entry terminating the table */
+ };
+ #endif
diff --color -rcNP Master/security/minisec/tty.c OG/security/minisec/tty.c
*** Master/security/minisec/tty.c	1969-12-31 19:00:00.000000000 -0500
--- OG/security/minisec/tty.c	2021-04-20 15:11:34.526000000 -0400
***************
*** 0 ****
--- 1,16 ----
+ #include <linux/kernel.h>
+ #include <linux/sched.h>
+ #include <linux/minisec.h>
+ #include <linux/capability.h>
+ #include <linux/tty.h>
+ 
+ /* 1 if harden_tty is on, @tty is current->signal->tty and no CAP_SYS_ADMIN; else 0. */
+ int gr_handle_tiocsti(struct tty_struct *tty)
+ {
+ #ifdef CONFIG_MINISEC_HARDEN_TTY
+ 	if (!grsec_enable_harden_tty || current->signal->tty != tty)
+ 		return 0;
+ 	return capable(CAP_SYS_ADMIN) ? 0 : 1;
+ #endif
+ 	return 0;
+ }
diff --color -rcNP Master/virt/kvm/kvm_main.c OG/virt/kvm/kvm_main.c
*** Master/virt/kvm/kvm_main.c	2021-04-20 14:17:32.000000000 -0400
--- OG/virt/kvm/kvm_main.c	2021-04-20 15:11:34.527000000 -0400
***************
*** 639,644 ****
--- 639,648 ----
  	struct kvm_stat_data *stat_data;
  	struct kvm_stats_debugfs_item *p;
  
+ #ifdef CONFIG_MINISEC_SYSFS_RESTRICT
+ 	return 0;
+ #endif
+ 
  	if (!debugfs_initialized())
  		return 0;
  
***************
*** 1642,1648 ****
  	 * Whoever called remap_pfn_range is also going to call e.g.
  	 * unmap_mapping_range before the underlying pages are freed,
  	 * causing a call to our MMU notifier.
! 	 */ 
  	kvm_get_pfn(pfn);
  
  out:
--- 1646,1652 ----
  	 * Whoever called remap_pfn_range is also going to call e.g.
  	 * unmap_mapping_range before the underlying pages are freed,
  	 * causing a call to our MMU notifier.
! 	 */
  	kvm_get_pfn(pfn);
  
  out:
diff --color -rcNP Master/virt/kvm/kvm_main.c.orig OG/virt/kvm/kvm_main.c.orig
*** Master/virt/kvm/kvm_main.c.orig	1969-12-31 19:00:00.000000000 -0500
--- OG/virt/kvm/kvm_main.c.orig	2021-04-20 15:10:45.400000000 -0400
***************
*** 0 ****
--- 1,4621 ----
+ // SPDX-License-Identifier: GPL-2.0-only
+ /*
+  * Kernel-based Virtual Machine driver for Linux
+  *
+  * This module enables machines with Intel VT-x extensions to run virtual
+  * machines without emulation or binary translation.
+  *
+  * Copyright (C) 2006 Qumranet, Inc.
+  * Copyright 2010 Red Hat, Inc. and/or its affiliates.
+  *
+  * Authors:
+  *   Avi Kivity   <avi@qumranet.com>
+  *   Yaniv Kamay  <yaniv@qumranet.com>
+  */
+ 
+ #include <kvm/iodev.h>
+ 
+ #include <linux/kvm_host.h>
+ #include <linux/kvm.h>
+ #include <linux/module.h>
+ #include <linux/errno.h>
+ #include <linux/percpu.h>
+ #include <linux/mm.h>
+ #include <linux/miscdevice.h>
+ #include <linux/vmalloc.h>
+ #include <linux/reboot.h>
+ #include <linux/debugfs.h>
+ #include <linux/highmem.h>
+ #include <linux/file.h>
+ #include <linux/syscore_ops.h>
+ #include <linux/cpu.h>
+ #include <linux/sched/signal.h>
+ #include <linux/sched/mm.h>
+ #include <linux/sched/stat.h>
+ #include <linux/cpumask.h>
+ #include <linux/smp.h>
+ #include <linux/anon_inodes.h>
+ #include <linux/profile.h>
+ #include <linux/kvm_para.h>
+ #include <linux/pagemap.h>
+ #include <linux/mman.h>
+ #include <linux/swap.h>
+ #include <linux/bitops.h>
+ #include <linux/spinlock.h>
+ #include <linux/compat.h>
+ #include <linux/srcu.h>
+ #include <linux/hugetlb.h>
+ #include <linux/slab.h>
+ #include <linux/sort.h>
+ #include <linux/bsearch.h>
+ #include <linux/io.h>
+ #include <linux/lockdep.h>
+ #include <linux/kthread.h>
+ 
+ #include <asm/processor.h>
+ #include <asm/ioctl.h>
+ #include <linux/uaccess.h>
+ #include <asm/pgtable.h>
+ 
+ #include "coalesced_mmio.h"
+ #include "async_pf.h"
+ #include "vfio.h"
+ 
+ #define CREATE_TRACE_POINTS
+ #include <trace/events/kvm.h>
+ 
+ /* Worst case buffer size needed for holding an integer. */
+ #define ITOA_MAX_LEN 12
+ 
+ MODULE_AUTHOR("Qumranet");
+ MODULE_LICENSE("GPL");
+ 
+ /* Architectures should define their poll value according to the halt latency */
+ unsigned int halt_poll_ns = KVM_HALT_POLL_NS_DEFAULT;
+ module_param(halt_poll_ns, uint, 0644);
+ EXPORT_SYMBOL_GPL(halt_poll_ns);
+ 
+ /* Default doubles per-vcpu halt_poll_ns. */
+ unsigned int halt_poll_ns_grow = 2;
+ module_param(halt_poll_ns_grow, uint, 0644);
+ EXPORT_SYMBOL_GPL(halt_poll_ns_grow);
+ 
+ /* The start value to grow halt_poll_ns from */
+ unsigned int halt_poll_ns_grow_start = 10000; /* 10us */
+ module_param(halt_poll_ns_grow_start, uint, 0644);
+ EXPORT_SYMBOL_GPL(halt_poll_ns_grow_start);
+ 
+ /* Default resets per-vcpu halt_poll_ns . */
+ unsigned int halt_poll_ns_shrink;
+ module_param(halt_poll_ns_shrink, uint, 0644);
+ EXPORT_SYMBOL_GPL(halt_poll_ns_shrink);
+ 
+ /*
+  * Ordering of locks:
+  *
+  *	kvm->lock --> kvm->slots_lock --> kvm->irq_lock
+  */
+ 
+ DEFINE_MUTEX(kvm_lock);
+ static DEFINE_RAW_SPINLOCK(kvm_count_lock);
+ LIST_HEAD(vm_list);
+ 
+ static cpumask_var_t cpus_hardware_enabled;
+ static int kvm_usage_count;
+ static atomic_t hardware_enable_failed;
+ 
+ struct kmem_cache *kvm_vcpu_cache;
+ EXPORT_SYMBOL_GPL(kvm_vcpu_cache);
+ 
+ static __read_mostly struct preempt_ops kvm_preempt_ops;
+ 
+ struct dentry *kvm_debugfs_dir;
+ EXPORT_SYMBOL_GPL(kvm_debugfs_dir);
+ 
+ static int kvm_debugfs_num_entries;
+ static const struct file_operations *stat_fops_per_vm[];
+ 
+ static long kvm_vcpu_ioctl(struct file *file, unsigned int ioctl,
+ 			   unsigned long arg);
+ #ifdef CONFIG_KVM_COMPAT
+ static long kvm_vcpu_compat_ioctl(struct file *file, unsigned int ioctl,
+ 				  unsigned long arg);
+ #define KVM_COMPAT(c)	.compat_ioctl	= (c)
+ #else
+ /*
+  * For architectures that don't implement a compat infrastructure,
+  * adopt a double line of defense:
+  * - Prevent a compat task from opening /dev/kvm
+  * - If the open has been done by a 64bit task, and the KVM fd
+  *   passed to a compat task, let the ioctls fail.
+  */
+ static long kvm_no_compat_ioctl(struct file *file, unsigned int ioctl,
+ 				unsigned long arg) { return -EINVAL; }
+ 
+ static int kvm_no_compat_open(struct inode *inode, struct file *file)
+ {
+ 	return is_compat_task() ? -ENODEV : 0;
+ }
+ #define KVM_COMPAT(c)	.compat_ioctl	= kvm_no_compat_ioctl,	\
+ 			.open		= kvm_no_compat_open
+ #endif
+ static int hardware_enable_all(void);
+ static void hardware_disable_all(void);
+ 
+ static void kvm_io_bus_destroy(struct kvm_io_bus *bus);
+ 
+ static void mark_page_dirty_in_slot(struct kvm_memory_slot *memslot, gfn_t gfn);
+ 
+ __visible bool kvm_rebooting;
+ EXPORT_SYMBOL_GPL(kvm_rebooting);
+ 
+ static bool largepages_enabled = true;
+ 
+ #define KVM_EVENT_CREATE_VM 0
+ #define KVM_EVENT_DESTROY_VM 1
+ static void kvm_uevent_notify_change(unsigned int type, struct kvm *kvm);
+ static unsigned long long kvm_createvm_count;
+ static unsigned long long kvm_active_vms;
+ 
+ __weak void kvm_arch_mmu_notifier_invalidate_range(struct kvm *kvm,
+ 						   unsigned long start, unsigned long end)
+ {
+ }
+ 
+ bool kvm_is_zone_device_pfn(kvm_pfn_t pfn)
+ {
+ 	/*
+ 	 * The metadata used by is_zone_device_page() to determine whether or
+ 	 * not a page is ZONE_DEVICE is guaranteed to be valid if and only if
+ 	 * the device has been pinned, e.g. by get_user_pages().  WARN if the
+ 	 * page_count() is zero to help detect bad usage of this helper.
+ 	 */
+ 	if (!pfn_valid(pfn) || WARN_ON_ONCE(!page_count(pfn_to_page(pfn))))
+ 		return false;
+ 
+ 	return is_zone_device_page(pfn_to_page(pfn));
+ }
+ 
+ bool kvm_is_reserved_pfn(kvm_pfn_t pfn)
+ {
+ 	/*
+ 	 * ZONE_DEVICE pages currently set PG_reserved, but from a refcounting
+ 	 * perspective they are "normal" pages, albeit with slightly different
+ 	 * usage rules.
+ 	 */
+ 	if (pfn_valid(pfn))
+ 		return PageReserved(pfn_to_page(pfn)) &&
+ 		       !is_zero_pfn(pfn) &&
+ 		       !kvm_is_zone_device_pfn(pfn);
+ 
+ 	return true;
+ }
+ 
+ /*
+  * Switches to specified vcpu, until a matching vcpu_put()
+  */
+ void vcpu_load(struct kvm_vcpu *vcpu)
+ {
+ 	int cpu = get_cpu();
+ 	preempt_notifier_register(&vcpu->preempt_notifier);
+ 	kvm_arch_vcpu_load(vcpu, cpu);
+ 	put_cpu();
+ }
+ EXPORT_SYMBOL_GPL(vcpu_load);
+ 
+ void vcpu_put(struct kvm_vcpu *vcpu)
+ {
+ 	preempt_disable();
+ 	kvm_arch_vcpu_put(vcpu);
+ 	preempt_notifier_unregister(&vcpu->preempt_notifier);
+ 	preempt_enable();
+ }
+ EXPORT_SYMBOL_GPL(vcpu_put);
+ 
+ /* TODO: merge with kvm_arch_vcpu_should_kick */
+ static bool kvm_request_needs_ipi(struct kvm_vcpu *vcpu, unsigned req)
+ {
+ 	int mode = kvm_vcpu_exiting_guest_mode(vcpu);
+ 
+ 	/*
+ 	 * We need to wait for the VCPU to reenable interrupts and get out of
+ 	 * READING_SHADOW_PAGE_TABLES mode.
+ 	 */
+ 	if (req & KVM_REQUEST_WAIT)
+ 		return mode != OUTSIDE_GUEST_MODE;
+ 
+ 	/*
+ 	 * Need to kick a running VCPU, but otherwise there is nothing to do.
+ 	 */
+ 	return mode == IN_GUEST_MODE;
+ }
+ 
+ static void ack_flush(void *_completed)
+ {
+ }
+ 
+ static inline bool kvm_kick_many_cpus(const struct cpumask *cpus, bool wait)
+ {
+ 	if (unlikely(!cpus))
+ 		cpus = cpu_online_mask;
+ 
+ 	if (cpumask_empty(cpus))
+ 		return false;
+ 
+ 	smp_call_function_many(cpus, ack_flush, NULL, wait);
+ 	return true;
+ }
+ 
+ bool kvm_make_vcpus_request_mask(struct kvm *kvm, unsigned int req,
+ 				 unsigned long *vcpu_bitmap, cpumask_var_t tmp)
+ {
+ 	int i, cpu, me;
+ 	struct kvm_vcpu *vcpu;
+ 	bool called;
+ 
+ 	me = get_cpu();
+ 
+ 	kvm_for_each_vcpu(i, vcpu, kvm) {
+ 		if (vcpu_bitmap && !test_bit(i, vcpu_bitmap))
+ 			continue;
+ 
+ 		kvm_make_request(req, vcpu);
+ 		cpu = vcpu->cpu;
+ 
+ 		if (!(req & KVM_REQUEST_NO_WAKEUP) && kvm_vcpu_wake_up(vcpu))
+ 			continue;
+ 
+ 		if (tmp != NULL && cpu != -1 && cpu != me &&
+ 		    kvm_request_needs_ipi(vcpu, req))
+ 			__cpumask_set_cpu(cpu, tmp);
+ 	}
+ 
+ 	called = kvm_kick_many_cpus(tmp, !!(req & KVM_REQUEST_WAIT));
+ 	put_cpu();
+ 
+ 	return called;
+ }
+ 
+ bool kvm_make_all_cpus_request(struct kvm *kvm, unsigned int req)
+ {
+ 	cpumask_var_t cpus;
+ 	bool called;
+ 
+ 	zalloc_cpumask_var(&cpus, GFP_ATOMIC);
+ 
+ 	called = kvm_make_vcpus_request_mask(kvm, req, NULL, cpus);
+ 
+ 	free_cpumask_var(cpus);
+ 	return called;
+ }
+ 
+ #ifndef CONFIG_HAVE_KVM_ARCH_TLB_FLUSH_ALL
+ void kvm_flush_remote_tlbs(struct kvm *kvm)
+ {
+ 	/*
+ 	 * Read tlbs_dirty before setting KVM_REQ_TLB_FLUSH in
+ 	 * kvm_make_all_cpus_request.
+ 	 */
+ 	long dirty_count = smp_load_acquire(&kvm->tlbs_dirty);
+ 
+ 	/*
+ 	 * We want to publish modifications to the page tables before reading
+ 	 * mode. Pairs with a memory barrier in arch-specific code.
+ 	 * - x86: smp_mb__after_srcu_read_unlock in vcpu_enter_guest
+ 	 * and smp_mb in walk_shadow_page_lockless_begin/end.
+ 	 * - powerpc: smp_mb in kvmppc_prepare_to_enter.
+ 	 *
+ 	 * There is already an smp_mb__after_atomic() before
+ 	 * kvm_make_all_cpus_request() reads vcpu->mode. We reuse that
+ 	 * barrier here.
+ 	 */
+ 	if (!kvm_arch_flush_remote_tlb(kvm)
+ 	    || kvm_make_all_cpus_request(kvm, KVM_REQ_TLB_FLUSH))
+ 		++kvm->stat.remote_tlb_flush;
+ 	cmpxchg(&kvm->tlbs_dirty, dirty_count, 0);
+ }
+ EXPORT_SYMBOL_GPL(kvm_flush_remote_tlbs);
+ #endif
+ 
+ void kvm_reload_remote_mmus(struct kvm *kvm)
+ {
+ 	kvm_make_all_cpus_request(kvm, KVM_REQ_MMU_RELOAD);
+ }
+ 
+ int kvm_vcpu_init(struct kvm_vcpu *vcpu, struct kvm *kvm, unsigned id)
+ {
+ 	struct page *page;
+ 	int r;
+ 
+ 	mutex_init(&vcpu->mutex);
+ 	vcpu->cpu = -1;
+ 	vcpu->kvm = kvm;
+ 	vcpu->vcpu_id = id;
+ 	vcpu->pid = NULL;
+ 	init_swait_queue_head(&vcpu->wq);
+ 	kvm_async_pf_vcpu_init(vcpu);
+ 
+ 	vcpu->pre_pcpu = -1;
+ 	INIT_LIST_HEAD(&vcpu->blocked_vcpu_list);
+ 
+ 	page = alloc_page(GFP_KERNEL | __GFP_ZERO);
+ 	if (!page) {
+ 		r = -ENOMEM;
+ 		goto fail;
+ 	}
+ 	vcpu->run = page_address(page);
+ 
+ 	kvm_vcpu_set_in_spin_loop(vcpu, false);
+ 	kvm_vcpu_set_dy_eligible(vcpu, false);
+ 	vcpu->preempted = false;
+ 	vcpu->ready = false;
+ 
+ 	r = kvm_arch_vcpu_init(vcpu);
+ 	if (r < 0)
+ 		goto fail_free_run;
+ 	return 0;
+ 
+ fail_free_run:
+ 	free_page((unsigned long)vcpu->run);
+ fail:
+ 	return r;
+ }
+ EXPORT_SYMBOL_GPL(kvm_vcpu_init);
+ 
+ void kvm_vcpu_uninit(struct kvm_vcpu *vcpu)
+ {
+ 	/*
+ 	 * no need for rcu_read_lock as VCPU_RUN is the only place that
+ 	 * will change the vcpu->pid pointer and on uninit all file
+ 	 * descriptors are already gone.
+ 	 */
+ 	put_pid(rcu_dereference_protected(vcpu->pid, 1));
+ 	kvm_arch_vcpu_uninit(vcpu);
+ 	free_page((unsigned long)vcpu->run);
+ }
+ EXPORT_SYMBOL_GPL(kvm_vcpu_uninit);
+ 
+ #if defined(CONFIG_MMU_NOTIFIER) && defined(KVM_ARCH_WANT_MMU_NOTIFIER)
+ static inline struct kvm *mmu_notifier_to_kvm(struct mmu_notifier *mn)
+ {
+ 	return container_of(mn, struct kvm, mmu_notifier);
+ }
+ 
+ static void kvm_mmu_notifier_invalidate_range(struct mmu_notifier *mn,
+ 					      struct mm_struct *mm,
+ 					      unsigned long start, unsigned long end)
+ {
+ 	struct kvm *kvm = mmu_notifier_to_kvm(mn);
+ 	int idx;
+ 
+ 	idx = srcu_read_lock(&kvm->srcu);
+ 	kvm_arch_mmu_notifier_invalidate_range(kvm, start, end);
+ 	srcu_read_unlock(&kvm->srcu, idx);
+ }
+ 
+ static void kvm_mmu_notifier_change_pte(struct mmu_notifier *mn,
+ 					struct mm_struct *mm,
+ 					unsigned long address,
+ 					pte_t pte)
+ {
+ 	struct kvm *kvm = mmu_notifier_to_kvm(mn);
+ 	int idx;
+ 
+ 	idx = srcu_read_lock(&kvm->srcu);
+ 	spin_lock(&kvm->mmu_lock);
+ 	kvm->mmu_notifier_seq++;
+ 
+ 	if (kvm_set_spte_hva(kvm, address, pte))
+ 		kvm_flush_remote_tlbs(kvm);
+ 
+ 	spin_unlock(&kvm->mmu_lock);
+ 	srcu_read_unlock(&kvm->srcu, idx);
+ }
+ 
+ static int kvm_mmu_notifier_invalidate_range_start(struct mmu_notifier *mn,
+ 					const struct mmu_notifier_range *range)
+ {
+ 	struct kvm *kvm = mmu_notifier_to_kvm(mn);
+ 	int need_tlb_flush = 0, idx;
+ 
+ 	idx = srcu_read_lock(&kvm->srcu);
+ 	spin_lock(&kvm->mmu_lock);
+ 	/*
+ 	 * The count increase must become visible at unlock time as no
+ 	 * spte can be established without taking the mmu_lock and
+ 	 * count is also read inside the mmu_lock critical section.
+ 	 */
+ 	kvm->mmu_notifier_count++;
+ 	need_tlb_flush = kvm_unmap_hva_range(kvm, range->start, range->end,
+ 					     range->flags);
+ 	/* we've to flush the tlb before the pages can be freed */
+ 	if (need_tlb_flush || kvm->tlbs_dirty)
+ 		kvm_flush_remote_tlbs(kvm);
+ 
+ 	spin_unlock(&kvm->mmu_lock);
+ 	srcu_read_unlock(&kvm->srcu, idx);
+ 
+ 	return 0;
+ }
+ 
+ static void kvm_mmu_notifier_invalidate_range_end(struct mmu_notifier *mn,
+ 					const struct mmu_notifier_range *range)
+ {
+ 	struct kvm *kvm = mmu_notifier_to_kvm(mn);
+ 
+ 	spin_lock(&kvm->mmu_lock);
+ 	/*
+ 	 * This sequence increase will notify the kvm page fault that
+ 	 * the page that is going to be mapped in the spte could have
+ 	 * been freed.
+ 	 */
+ 	kvm->mmu_notifier_seq++;
+ 	smp_wmb();
+ 	/*
+ 	 * The above sequence increase must be visible before the
+ 	 * below count decrease, which is ensured by the smp_wmb above
+ 	 * in conjunction with the smp_rmb in mmu_notifier_retry().
+ 	 */
+ 	kvm->mmu_notifier_count--;
+ 	spin_unlock(&kvm->mmu_lock);
+ 
+ 	BUG_ON(kvm->mmu_notifier_count < 0);
+ }
+ 
+ static int kvm_mmu_notifier_clear_flush_young(struct mmu_notifier *mn,
+ 					      struct mm_struct *mm,
+ 					      unsigned long start,
+ 					      unsigned long end)
+ {
+ 	struct kvm *kvm = mmu_notifier_to_kvm(mn);
+ 	int young, idx;
+ 
+ 	idx = srcu_read_lock(&kvm->srcu);
+ 	spin_lock(&kvm->mmu_lock);
+ 
+ 	young = kvm_age_hva(kvm, start, end);
+ 	if (young)
+ 		kvm_flush_remote_tlbs(kvm);
+ 
+ 	spin_unlock(&kvm->mmu_lock);
+ 	srcu_read_unlock(&kvm->srcu, idx);
+ 
+ 	return young;
+ }
+ 
+ static int kvm_mmu_notifier_clear_young(struct mmu_notifier *mn,
+ 					struct mm_struct *mm,
+ 					unsigned long start,
+ 					unsigned long end)
+ {
+ 	struct kvm *kvm = mmu_notifier_to_kvm(mn);
+ 	int young, idx;
+ 
+ 	idx = srcu_read_lock(&kvm->srcu);
+ 	spin_lock(&kvm->mmu_lock);
+ 	/*
+ 	 * Even though we do not flush TLB, this will still adversely
+ 	 * affect performance on pre-Haswell Intel EPT, where there is
+ 	 * no EPT Access Bit to clear so that we have to tear down EPT
+ 	 * tables instead. If we find this unacceptable, we can always
+ 	 * add a parameter to kvm_age_hva so that it effectively doesn't
+ 	 * do anything on clear_young.
+ 	 *
+ 	 * Also note that currently we never issue secondary TLB flushes
+ 	 * from clear_young, leaving this job up to the regular system
+ 	 * cadence. If we find this inaccurate, we might come up with a
+ 	 * more sophisticated heuristic later.
+ 	 */
+ 	young = kvm_age_hva(kvm, start, end);
+ 	spin_unlock(&kvm->mmu_lock);
+ 	srcu_read_unlock(&kvm->srcu, idx);
+ 
+ 	return young;
+ }
+ 
+ static int kvm_mmu_notifier_test_young(struct mmu_notifier *mn,
+ 				       struct mm_struct *mm,
+ 				       unsigned long address)
+ {
+ 	struct kvm *kvm = mmu_notifier_to_kvm(mn);
+ 	int young, idx;
+ 
+ 	idx = srcu_read_lock(&kvm->srcu);
+ 	spin_lock(&kvm->mmu_lock);
+ 	young = kvm_test_age_hva(kvm, address);
+ 	spin_unlock(&kvm->mmu_lock);
+ 	srcu_read_unlock(&kvm->srcu, idx);
+ 
+ 	return young;
+ }
+ 
+ static void kvm_mmu_notifier_release(struct mmu_notifier *mn,
+ 				     struct mm_struct *mm)
+ {
+ 	struct kvm *kvm = mmu_notifier_to_kvm(mn);
+ 	int idx;
+ 
+ 	idx = srcu_read_lock(&kvm->srcu);
+ 	kvm_arch_flush_shadow_all(kvm);
+ 	srcu_read_unlock(&kvm->srcu, idx);
+ }
+ 
+ static const struct mmu_notifier_ops kvm_mmu_notifier_ops = {
+ 	.invalidate_range	= kvm_mmu_notifier_invalidate_range,
+ 	.invalidate_range_start	= kvm_mmu_notifier_invalidate_range_start,
+ 	.invalidate_range_end	= kvm_mmu_notifier_invalidate_range_end,
+ 	.clear_flush_young	= kvm_mmu_notifier_clear_flush_young,
+ 	.clear_young		= kvm_mmu_notifier_clear_young,
+ 	.test_young		= kvm_mmu_notifier_test_young,
+ 	.change_pte		= kvm_mmu_notifier_change_pte,
+ 	.release		= kvm_mmu_notifier_release,
+ };
+ 
+ static int kvm_init_mmu_notifier(struct kvm *kvm)
+ {
+ 	kvm->mmu_notifier.ops = &kvm_mmu_notifier_ops;
+ 	return mmu_notifier_register(&kvm->mmu_notifier, current->mm);
+ }
+ 
+ #else  /* !(CONFIG_MMU_NOTIFIER && KVM_ARCH_WANT_MMU_NOTIFIER) */
+ 
+ static int kvm_init_mmu_notifier(struct kvm *kvm)
+ {
+ 	return 0;
+ }
+ 
+ #endif /* CONFIG_MMU_NOTIFIER && KVM_ARCH_WANT_MMU_NOTIFIER */
+ 
+ static struct kvm_memslots *kvm_alloc_memslots(void)
+ {
+ 	int i;
+ 	struct kvm_memslots *slots;
+ 
+ 	slots = kvzalloc(sizeof(struct kvm_memslots), GFP_KERNEL_ACCOUNT);
+ 	if (!slots)
+ 		return NULL;
+ 
+ 	for (i = 0; i < KVM_MEM_SLOTS_NUM; i++)
+ 		slots->id_to_index[i] = slots->memslots[i].id = i;
+ 
+ 	return slots;
+ }
+ 
+ static void kvm_destroy_dirty_bitmap(struct kvm_memory_slot *memslot)
+ {
+ 	if (!memslot->dirty_bitmap)
+ 		return;
+ 
+ 	kvfree(memslot->dirty_bitmap);
+ 	memslot->dirty_bitmap = NULL;
+ }
+ 
+ /*
+  * Release the resources of @free that are not shared with @dont.
+  *
+  * The dirty bitmap is destroyed unless @dont points at the very same bitmap;
+  * a NULL @dont means nothing is shared.  Arch-specific state is passed on to
+  * kvm_arch_free_memslot(), and @free is left with npages == 0.
+  */
+ static void kvm_free_memslot(struct kvm *kvm, struct kvm_memory_slot *free,
+ 			      struct kvm_memory_slot *dont)
+ {
+ 	bool bitmap_shared = dont && free->dirty_bitmap == dont->dirty_bitmap;
+ 
+ 	if (!bitmap_shared)
+ 		kvm_destroy_dirty_bitmap(free);
+ 
+ 	kvm_arch_free_memslot(kvm, free, dont);
+ 	free->npages = 0;
+ }
+ 
+ /*
+  * Free each memslot visited by kvm_for_each_memslot() and then the @slots
+  * container itself.  A NULL @slots is ignored.
+  */
+ static void kvm_free_memslots(struct kvm *kvm, struct kvm_memslots *slots)
+ {
+ 	struct kvm_memory_slot *cur;
+ 
+ 	if (slots) {
+ 		kvm_for_each_memslot(cur, slots)
+ 			kvm_free_memslot(kvm, cur, NULL);
+ 
+ 		kvfree(slots);
+ 	}
+ }
+ 
+ /*
+  * Remove the VM's debugfs directory and free the per-stat data attached to
+  * @kvm.  Nothing at all is done when kvm->debugfs_dentry is NULL.
+  */
+ static void kvm_destroy_vm_debugfs(struct kvm *kvm)
+ {
+ 	int n;
+ 
+ 	if (kvm->debugfs_dentry == NULL)
+ 		return;
+ 
+ 	debugfs_remove_recursive(kvm->debugfs_dentry);
+ 
+ 	if (kvm->debugfs_stat_data == NULL)
+ 		return;
+ 
+ 	for (n = 0; n < kvm_debugfs_num_entries; n++)
+ 		kfree(kvm->debugfs_stat_data[n]);
+ 	kfree(kvm->debugfs_stat_data);
+ }
+ 
+ /*
+  * Create the per-VM debugfs directory, named "<pid>-<fd>" under
+  * kvm_debugfs_dir, and add one file per debugfs_entries[] item.
+  *
+  * Returns 0 on success, and also when nothing is created because
+  * CONFIG_MINISEC_SYSFS_RESTRICT is defined or debugfs is not initialized.
+  * Returns -ENOMEM if an allocation fails; whatever was allocated up to that
+  * point stays attached to @kvm and is not freed here.
+  */
+ static int kvm_create_vm_debugfs(struct kvm *kvm, int fd)
+ {
+ 	char dir_name[ITOA_MAX_LEN * 2];
+ 	struct kvm_stat_data *stat_data;
+ 	struct kvm_stats_debugfs_item *p;
+ 
+ 	/* With this option set, no per-VM debugfs entries are created. */
+ #ifdef CONFIG_MINISEC_SYSFS_RESTRICT
+ 	return 0;
+ #endif
+ 
+ 	if (!debugfs_initialized())
+ 		return 0;
+ 
+ 	snprintf(dir_name, sizeof(dir_name), "%d-%d", task_pid_nr(current), fd);
+ 	kvm->debugfs_dentry = debugfs_create_dir(dir_name, kvm_debugfs_dir);
+ 
+ 	/* One pointer per entry; filled in by the loop below. */
+ 	kvm->debugfs_stat_data = kcalloc(kvm_debugfs_num_entries,
+ 					 sizeof(*kvm->debugfs_stat_data),
+ 					 GFP_KERNEL_ACCOUNT);
+ 	if (!kvm->debugfs_stat_data)
+ 		return -ENOMEM;
+ 
+ 	/* debugfs_entries[] is terminated by an item with a NULL name. */
+ 	for (p = debugfs_entries; p->name; p++) {
+ 		stat_data = kzalloc(sizeof(*stat_data), GFP_KERNEL_ACCOUNT);
+ 		if (!stat_data)
+ 			return -ENOMEM;
+ 
+ 		stat_data->kvm = kvm;
+ 		stat_data->offset = p->offset;
+ 		/* Entries that specify no mode default to 0644. */
+ 		stat_data->mode = p->mode ? p->mode : 0644;
+ 		/* Stored at the entry's position within debugfs_entries[]. */
+ 		kvm->debugfs_stat_data[p - debugfs_entries] = stat_data;
+ 		debugfs_create_file(p->name, stat_data->mode, kvm->debugfs_dentry,
+ 				    stat_data, stat_fops_per_vm[p->kind]);
+ 	}
+ 	return 0;
+ }
+ 
+ /*
+  * Called after the VM is otherwise initialized, but just before adding it to
+  * the vm_list.
+  *
+  * This is the weak default, which does nothing and returns 0.  A non-zero
+  * return value makes kvm_create_vm() unwind and fail with that value.
+  */
+ int __weak kvm_arch_post_init_vm(struct kvm *kvm)
+ {
+ 	return 0;
+ }
+ 
+ /*
+  * Called just after removing the VM from the vm_list, but before doing any
+  * other destruction.
+  *
+  * This is the weak default and is intentionally empty.
+  */
+ void __weak kvm_arch_pre_destroy_vm(struct kvm *kvm)
+ {
+ }
+ 
+ /*
+  * Allocate and initialize a new VM bound to the calling task's mm.
+  *
+  * @type is passed through to kvm_arch_init_vm().
+  *
+  * On success the VM holds a single users_count reference, has been added to
+  * vm_list and is returned.  On failure everything set up so far is unwound
+  * through the out_* labels and an ERR_PTR() carrying the error is returned.
+  */
+ static struct kvm *kvm_create_vm(unsigned long type)
+ {
+ 	struct kvm *kvm = kvm_arch_alloc_vm();
+ 	int r = -ENOMEM;
+ 	int i;
+ 
+ 	if (!kvm)
+ 		return ERR_PTR(-ENOMEM);
+ 
+ 	spin_lock_init(&kvm->mmu_lock);
+ 	/* Reference on the mm; the error path below drops it with mmdrop(). */
+ 	mmgrab(current->mm);
+ 	kvm->mm = current->mm;
+ 	kvm_eventfd_init(kvm);
+ 	mutex_init(&kvm->lock);
+ 	mutex_init(&kvm->irq_lock);
+ 	mutex_init(&kvm->slots_lock);
+ 	INIT_LIST_HEAD(&kvm->devices);
+ 
+ 	BUILD_BUG_ON(KVM_MEM_SLOTS_NUM > SHRT_MAX);
+ 
+ 	if (init_srcu_struct(&kvm->srcu))
+ 		goto out_err_no_srcu;
+ 	if (init_srcu_struct(&kvm->irq_srcu))
+ 		goto out_err_no_irq_srcu;
+ 
+ 	refcount_set(&kvm->users_count, 1);
+ 	for (i = 0; i < KVM_ADDRESS_SPACE_NUM; i++) {
+ 		struct kvm_memslots *slots = kvm_alloc_memslots();
+ 
+ 		if (!slots)
+ 			goto out_err_no_arch_destroy_vm;
+ 		/* Generations must be different for each address space. */
+ 		slots->generation = i;
+ 		rcu_assign_pointer(kvm->memslots[i], slots);
+ 	}
+ 
+ 	for (i = 0; i < KVM_NR_BUSES; i++) {
+ 		rcu_assign_pointer(kvm->buses[i],
+ 			kzalloc(sizeof(struct kvm_io_bus), GFP_KERNEL_ACCOUNT));
+ 		if (!kvm->buses[i])
+ 			goto out_err_no_arch_destroy_vm;
+ 	}
+ 
+ 	r = kvm_arch_init_vm(kvm, type);
+ 	if (r)
+ 		goto out_err_no_arch_destroy_vm;
+ 
+ 	r = hardware_enable_all();
+ 	if (r)
+ 		goto out_err_no_disable;
+ 
+ #ifdef CONFIG_HAVE_KVM_IRQFD
+ 	INIT_HLIST_HEAD(&kvm->irq_ack_notifier_list);
+ #endif
+ 
+ 	r = kvm_init_mmu_notifier(kvm);
+ 	if (r)
+ 		goto out_err_no_mmu_notifier;
+ 
+ 	r = kvm_arch_post_init_vm(kvm);
+ 	if (r)
+ 		goto out_err;
+ 
+ 	/* Fully initialized: publish the VM on the global list. */
+ 	mutex_lock(&kvm_lock);
+ 	list_add(&kvm->vm_list, &vm_list);
+ 	mutex_unlock(&kvm_lock);
+ 
+ 	preempt_notifier_inc();
+ 
+ 	return kvm;
+ 
+ 	/*
+ 	 * Error unwinding.  Each label undoes the steps that had completed
+ 	 * before the corresponding goto and then falls through to the next.
+ 	 */
+ out_err:
+ #if defined(CONFIG_MMU_NOTIFIER) && defined(KVM_ARCH_WANT_MMU_NOTIFIER)
+ 	if (kvm->mmu_notifier.ops)
+ 		mmu_notifier_unregister(&kvm->mmu_notifier, current->mm);
+ #endif
+ out_err_no_mmu_notifier:
+ 	hardware_disable_all();
+ out_err_no_disable:
+ 	kvm_arch_destroy_vm(kvm);
+ out_err_no_arch_destroy_vm:
+ 	/* Dropping the initial reference is expected to reach zero here. */
+ 	WARN_ON_ONCE(!refcount_dec_and_test(&kvm->users_count));
+ 	for (i = 0; i < KVM_NR_BUSES; i++)
+ 		kfree(kvm_get_bus(kvm, i));
+ 	for (i = 0; i < KVM_ADDRESS_SPACE_NUM; i++)
+ 		kvm_free_memslots(kvm, __kvm_memslots(kvm, i));
+ 	cleanup_srcu_struct(&kvm->irq_srcu);
+ out_err_no_irq_srcu:
+ 	cleanup_srcu_struct(&kvm->srcu);
+ out_err_no_srcu:
+ 	kvm_arch_free_vm(kvm);
+ 	mmdrop(current->mm);
+ 	return ERR_PTR(r);
+ }
+ 
+ /* Unlink every device from kvm->devices and invoke its ->destroy() op. */
+ static void kvm_destroy_devices(struct kvm *kvm)
+ {
+ 	struct kvm_device *dev, *next;
+ 
+ 	/*
+ 	 * kvm->lock is deliberately not taken: by now nobody else holds a
+ 	 * reference to the struct kvm, so the devices list cannot be reached
+ 	 * from anywhere else.
+ 	 */
+ 	list_for_each_entry_safe(dev, next, &kvm->devices, vm_node) {
+ 		list_del(&dev->vm_node);
+ 		dev->ops->destroy(dev);
+ 	}
+ }
+ 
+ /*
+  * Tear down @kvm.  kvm_put_kvm() calls this once users_count has dropped to
+  * zero.
+  */
+ static void kvm_destroy_vm(struct kvm *kvm)
+ {
+ 	int i;
+ 	/* Cached because mmdrop() runs after kvm_arch_free_vm(kvm) below. */
+ 	struct mm_struct *mm = kvm->mm;
+ 
+ 	kvm_uevent_notify_change(KVM_EVENT_DESTROY_VM, kvm);
+ 	kvm_destroy_vm_debugfs(kvm);
+ 	kvm_arch_sync_events(kvm);
+ 	mutex_lock(&kvm_lock);
+ 	list_del(&kvm->vm_list);
+ 	mutex_unlock(&kvm_lock);
+ 	kvm_arch_pre_destroy_vm(kvm);
+ 
+ 	kvm_free_irq_routing(kvm);
+ 	for (i = 0; i < KVM_NR_BUSES; i++) {
+ 		struct kvm_io_bus *bus = kvm_get_bus(kvm, i);
+ 
+ 		if (bus)
+ 			kvm_io_bus_destroy(bus);
+ 		kvm->buses[i] = NULL;
+ 	}
+ 	kvm_coalesced_mmio_free(kvm);
+ 	/*
+ 	 * With MMU notifiers the notifier is unregistered here; without them
+ 	 * kvm_arch_flush_shadow_all() is called directly instead.
+ 	 */
+ #if defined(CONFIG_MMU_NOTIFIER) && defined(KVM_ARCH_WANT_MMU_NOTIFIER)
+ 	mmu_notifier_unregister(&kvm->mmu_notifier, kvm->mm);
+ #else
+ 	kvm_arch_flush_shadow_all(kvm);
+ #endif
+ 	kvm_arch_destroy_vm(kvm);
+ 	kvm_destroy_devices(kvm);
+ 	for (i = 0; i < KVM_ADDRESS_SPACE_NUM; i++)
+ 		kvm_free_memslots(kvm, __kvm_memslots(kvm, i));
+ 	cleanup_srcu_struct(&kvm->irq_srcu);
+ 	cleanup_srcu_struct(&kvm->srcu);
+ 	kvm_arch_free_vm(kvm);
+ 	/* Counterparts of the calls made in kvm_create_vm(). */
+ 	preempt_notifier_dec();
+ 	hardware_disable_all();
+ 	mmdrop(mm);
+ }
+ 
+ /* Take an additional reference on @kvm by incrementing users_count. */
+ void kvm_get_kvm(struct kvm *kvm)
+ {
+ 	refcount_inc(&kvm->users_count);
+ }
+ EXPORT_SYMBOL_GPL(kvm_get_kvm);
+ 
+ /*
+  * Drop one reference on @kvm.  When users_count reaches zero the VM is torn
+  * down through kvm_destroy_vm().
+  */
+ void kvm_put_kvm(struct kvm *kvm)
+ {
+ 	if (!refcount_dec_and_test(&kvm->users_count))
+ 		return;
+ 
+ 	kvm_destroy_vm(kvm);
+ }
+ EXPORT_SYMBOL_GPL(kvm_put_kvm);
+ 
+ 
+ /*
+  * Release handler for a VM file: release the VM's irqfds and drop the
+  * reference held through @filp->private_data.  Always returns 0.
+  */
+ static int kvm_vm_release(struct inode *inode, struct file *filp)
+ {
+ 	struct kvm *vm = filp->private_data;
+ 
+ 	kvm_irqfd_release(vm);
+ 	kvm_put_kvm(vm);
+ 
+ 	return 0;
+ }
+ 
+ /*
+  * Allocate a zeroed dirty bitmap for @memslot.
+  *
+  * Twice the size reported by kvm_dirty_bitmap_bytes() is allocated; see
+  * x86's kvm_vm_ioctl_get_dirty_log() for why the extra half is needed.
+  *
+  * Returns 0 on success or -ENOMEM if the allocation fails.
+  */
+ static int kvm_create_dirty_bitmap(struct kvm_memory_slot *memslot)
+ {
+ 	unsigned long alloc_bytes = kvm_dirty_bitmap_bytes(memslot) * 2;
+ 
+ 	memslot->dirty_bitmap = kvzalloc(alloc_bytes, GFP_KERNEL_ACCOUNT);
+ 
+ 	return memslot->dirty_bitmap ? 0 : -ENOMEM;
+ }
+ 
+ /*
+  * Insert memslot and re-sort memslots based on their GFN,
+  * so binary search could be used to lookup GFN.
+  * Sorting algorithm takes advantage of having initially
+  * sorted array and known changed memslot position.
+  *
+  * @slots:  container whose memslots[] and id_to_index[] are updated in place
+  * @new:    the slot contents to store; new->id selects the entry to replace
+  * @change: only KVM_MR_CREATE and KVM_MR_DELETE adjust slots->used_slots
+  *
+  * As the loop conditions below show, entries with a larger base_gfn end up
+  * at lower indices.
+  */
+ static void update_memslots(struct kvm_memslots *slots,
+ 			    struct kvm_memory_slot *new,
+ 			    enum kvm_mr_change change)
+ {
+ 	int id = new->id;
+ 	int i = slots->id_to_index[id];
+ 	struct kvm_memory_slot *mslots = slots->memslots;
+ 
+ 	WARN_ON(mslots[i].id != id);
+ 	switch (change) {
+ 	case KVM_MR_CREATE:
+ 		slots->used_slots++;
+ 		WARN_ON(mslots[i].npages || !new->npages);
+ 		break;
+ 	case KVM_MR_DELETE:
+ 		slots->used_slots--;
+ 		WARN_ON(new->npages || !mslots[i].npages);
+ 		break;
+ 	default:
+ 		break;
+ 	}
+ 
+ 	/*
+ 	 * Pull following entries one position forward while their base_gfn
+ 	 * is not smaller than the new one, stopping at the first entry that
+ 	 * has no pages.
+ 	 */
+ 	while (i < KVM_MEM_SLOTS_NUM - 1 &&
+ 	       new->base_gfn <= mslots[i + 1].base_gfn) {
+ 		if (!mslots[i + 1].npages)
+ 			break;
+ 		mslots[i] = mslots[i + 1];
+ 		slots->id_to_index[mslots[i].id] = i;
+ 		i++;
+ 	}
+ 
+ 	/*
+ 	 * The ">=" is needed when creating a slot with base_gfn == 0,
+ 	 * so that it moves before all those with base_gfn == npages == 0.
+ 	 *
+ 	 * On the other hand, if new->npages is zero, the above loop has
+ 	 * already left i pointing to the beginning of the empty part of
+ 	 * mslots, and the ">=" would move the hole backwards in this
+ 	 * case---which is wrong.  So skip the loop when deleting a slot.
+ 	 */
+ 	if (new->npages) {
+ 		while (i > 0 &&
+ 		       new->base_gfn >= mslots[i - 1].base_gfn) {
+ 			mslots[i] = mslots[i - 1];
+ 			slots->id_to_index[mslots[i].id] = i;
+ 			i--;
+ 		}
+ 	} else
+ 		WARN_ON_ONCE(i != slots->used_slots);
+ 
+ 	/* i is now the final position for the new slot contents. */
+ 	mslots[i] = *new;
+ 	slots->id_to_index[mslots[i].id] = i;
+ }
+ 
+ /*
+  * Validate the flags of a userspace memory region request.
+  *
+  * KVM_MEM_LOG_DIRTY_PAGES is always accepted; KVM_MEM_READONLY only when
+  * __KVM_HAVE_READONLY_MEM is defined.  Returns 0 if no other bit is set and
+  * -EINVAL otherwise.
+  */
+ static int check_memory_region_flags(const struct kvm_userspace_memory_region *mem)
+ {
+ 	u32 allowed = KVM_MEM_LOG_DIRTY_PAGES;
+ 
+ #ifdef __KVM_HAVE_READONLY_MEM
+ 	allowed |= KVM_MEM_READONLY;
+ #endif
+ 
+ 	return (mem->flags & ~allowed) ? -EINVAL : 0;
+ }
+ 
+ /*
+  * Publish @slots as the memslots of address space @as_id and wait for
+  * existing SRCU readers of kvm->srcu to finish.
+  *
+  * While the switch is in progress the new generation carries
+  * KVM_MEMSLOT_GEN_UPDATE_IN_PROGRESS; afterwards the flag is dropped and the
+  * generation is advanced by KVM_ADDRESS_SPACE_NUM.
+  *
+  * Returns the previously installed memslots; this function does not free
+  * them.
+  */
+ static struct kvm_memslots *install_new_memslots(struct kvm *kvm,
+ 		int as_id, struct kvm_memslots *slots)
+ {
+ 	struct kvm_memslots *old_memslots = __kvm_memslots(kvm, as_id);
+ 	u64 gen = old_memslots->generation;
+ 
+ 	WARN_ON(gen & KVM_MEMSLOT_GEN_UPDATE_IN_PROGRESS);
+ 	slots->generation = gen | KVM_MEMSLOT_GEN_UPDATE_IN_PROGRESS;
+ 
+ 	rcu_assign_pointer(kvm->memslots[as_id], slots);
+ 	synchronize_srcu_expedited(&kvm->srcu);
+ 
+ 	/*
+ 	 * Increment the new memslot generation a second time, dropping the
+ 	 * update in-progress flag and incrementing the generation based on
+ 	 * the number of address spaces.  This provides a unique and easily
+ 	 * identifiable generation number while the memslots are in flux.
+ 	 */
+ 	gen = slots->generation & ~KVM_MEMSLOT_GEN_UPDATE_IN_PROGRESS;
+ 
+ 	/*
+ 	 * Generations must be unique even across address spaces.  We do not need
+ 	 * a global counter for that, instead the generation space is evenly split
+ 	 * across address spaces.  For example, with two address spaces, address
+ 	 * space 0 will use generations 0, 2, 4, ... while address space 1 will
+ 	 * use generations 1, 3, 5, ...
+ 	 */
+ 	gen += KVM_ADDRESS_SPACE_NUM;
+ 
+ 	/* The arch hook sees the final generation before it is stored. */
+ 	kvm_arch_memslots_updated(kvm, gen);
+ 
+ 	slots->generation = gen;
+ 
+ 	return old_memslots;
+ }
+ 
+ /*
+  * Allocate some memory and give it an address in the guest physical address
+  * space.
+  *
+  * Discontiguous memory is allowed, mostly for framebuffers.
+  *
+  * Must be called holding kvm->slots_lock for write.
+  *
+  * @mem->slot carries the address-space id in its upper 16 bits and the slot
+  * id in its lower 16 bits.  A memory_size of zero deletes the slot.
+  *
+  * Returns 0 on success (including when nothing needs to change), -EINVAL for
+  * invalid arguments, -EEXIST if the range overlaps another slot, -ENOMEM if
+  * an allocation fails, or the error from kvm_arch_prepare_memory_region().
+  */
+ int __kvm_set_memory_region(struct kvm *kvm,
+ 			    const struct kvm_userspace_memory_region *mem)
+ {
+ 	int r;
+ 	gfn_t base_gfn;
+ 	unsigned long npages;
+ 	struct kvm_memory_slot *slot;
+ 	struct kvm_memory_slot old, new;
+ 	struct kvm_memslots *slots = NULL, *old_memslots;
+ 	int as_id, id;
+ 	enum kvm_mr_change change;
+ 
+ 	r = check_memory_region_flags(mem);
+ 	if (r)
+ 		goto out;
+ 
+ 	r = -EINVAL;
+ 	as_id = mem->slot >> 16;
+ 	id = (u16)mem->slot;
+ 
+ 	/* General sanity checks */
+ 	if (mem->memory_size & (PAGE_SIZE - 1))
+ 		goto out;
+ 	if (mem->guest_phys_addr & (PAGE_SIZE - 1))
+ 		goto out;
+ 	/* We can read the guest memory with __xxx_user() later on. */
+ 	if ((id < KVM_USER_MEM_SLOTS) &&
+ 	    ((mem->userspace_addr & (PAGE_SIZE - 1)) ||
+ 	     (mem->userspace_addr != untagged_addr(mem->userspace_addr)) ||
+ 	     !access_ok((void __user *)(unsigned long)mem->userspace_addr,
+ 			mem->memory_size)))
+ 		goto out;
+ 	if (as_id >= KVM_ADDRESS_SPACE_NUM || id >= KVM_MEM_SLOTS_NUM)
+ 		goto out;
+ 	/* Reject a guest physical range that wraps around. */
+ 	if (mem->guest_phys_addr + mem->memory_size < mem->guest_phys_addr)
+ 		goto out;
+ 
+ 	slot = id_to_memslot(__kvm_memslots(kvm, as_id), id);
+ 	base_gfn = mem->guest_phys_addr >> PAGE_SHIFT;
+ 	npages = mem->memory_size >> PAGE_SHIFT;
+ 
+ 	if (npages > KVM_MEM_MAX_NR_PAGES)
+ 		goto out;
+ 
+ 	/* Work on copies: "old" is the current state, "new" the requested one. */
+ 	new = old = *slot;
+ 
+ 	new.id = id;
+ 	new.base_gfn = base_gfn;
+ 	new.npages = npages;
+ 	new.flags = mem->flags;
+ 
+ 	/* Classify the request as create, move, flags-only change or delete. */
+ 	if (npages) {
+ 		if (!old.npages)
+ 			change = KVM_MR_CREATE;
+ 		else { /* Modify an existing slot. */
+ 			/*
+ 			 * The host address, the size and the read-only flag
+ 			 * of an existing slot cannot be changed.
+ 			 */
+ 			if ((mem->userspace_addr != old.userspace_addr) ||
+ 			    (npages != old.npages) ||
+ 			    ((new.flags ^ old.flags) & KVM_MEM_READONLY))
+ 				goto out;
+ 
+ 			if (base_gfn != old.base_gfn)
+ 				change = KVM_MR_MOVE;
+ 			else if (new.flags != old.flags)
+ 				change = KVM_MR_FLAGS_ONLY;
+ 			else { /* Nothing to change. */
+ 				r = 0;
+ 				goto out;
+ 			}
+ 		}
+ 	} else {
+ 		/* Deleting a slot that has no pages fails with -EINVAL. */
+ 		if (!old.npages)
+ 			goto out;
+ 
+ 		change = KVM_MR_DELETE;
+ 		new.base_gfn = 0;
+ 		new.flags = 0;
+ 	}
+ 
+ 	if ((change == KVM_MR_CREATE) || (change == KVM_MR_MOVE)) {
+ 		/* Check for overlaps */
+ 		r = -EEXIST;
+ 		kvm_for_each_memslot(slot, __kvm_memslots(kvm, as_id)) {
+ 			if (slot->id == id)
+ 				continue;
+ 			if (!((base_gfn + npages <= slot->base_gfn) ||
+ 			      (base_gfn >= slot->base_gfn + slot->npages)))
+ 				goto out;
+ 		}
+ 	}
+ 
+ 	/* Free page dirty bitmap if unneeded */
+ 	if (!(new.flags & KVM_MEM_LOG_DIRTY_PAGES))
+ 		new.dirty_bitmap = NULL;
+ 
+ 	r = -ENOMEM;
+ 	if (change == KVM_MR_CREATE) {
+ 		new.userspace_addr = mem->userspace_addr;
+ 
+ 		if (kvm_arch_create_memslot(kvm, &new, npages))
+ 			goto out_free;
+ 	}
+ 
+ 	/* Allocate page dirty bitmap if needed */
+ 	if ((new.flags & KVM_MEM_LOG_DIRTY_PAGES) && !new.dirty_bitmap) {
+ 		if (kvm_create_dirty_bitmap(&new) < 0)
+ 			goto out_free;
+ 	}
+ 
+ 	/* Build the replacement container from a copy of the current one. */
+ 	slots = kvzalloc(sizeof(struct kvm_memslots), GFP_KERNEL_ACCOUNT);
+ 	if (!slots)
+ 		goto out_free;
+ 	memcpy(slots, __kvm_memslots(kvm, as_id), sizeof(struct kvm_memslots));
+ 
+ 	if ((change == KVM_MR_DELETE) || (change == KVM_MR_MOVE)) {
+ 		slot = id_to_memslot(slots, id);
+ 		slot->flags |= KVM_MEMSLOT_INVALID;
+ 
+ 		old_memslots = install_new_memslots(kvm, as_id, slots);
+ 
+ 		/* From this point no new shadow pages pointing to a deleted,
+ 		 * or moved, memslot will be created.
+ 		 *
+ 		 * validation of sp->gfn happens in:
+ 		 *	- gfn_to_hva (kvm_read_guest, gfn_to_pfn)
+ 		 *	- kvm_is_visible_gfn (mmu_check_roots)
+ 		 */
+ 		kvm_arch_flush_shadow_memslot(kvm, slot);
+ 
+ 		/*
+ 		 * We can re-use the old_memslots from above, the only difference
+ 		 * from the currently installed memslots is the invalid flag.  This
+ 		 * will get overwritten by update_memslots anyway.
+ 		 */
+ 		slots = old_memslots;
+ 	}
+ 
+ 	r = kvm_arch_prepare_memory_region(kvm, &new, mem, change);
+ 	if (r)
+ 		goto out_slots;
+ 
+ 	/* actual memory is freed via old in kvm_free_memslot below */
+ 	if (change == KVM_MR_DELETE) {
+ 		new.dirty_bitmap = NULL;
+ 		memset(&new.arch, 0, sizeof(new.arch));
+ 	}
+ 
+ 	update_memslots(slots, &new, change);
+ 	old_memslots = install_new_memslots(kvm, as_id, slots);
+ 
+ 	kvm_arch_commit_memory_region(kvm, mem, &old, &new, change);
+ 
+ 	/* Free whatever "old" owned that "new" does not share. */
+ 	kvm_free_memslot(kvm, &old, &new);
+ 	kvfree(old_memslots);
+ 	return 0;
+ 
+ out_slots:
+ 	kvfree(slots);
+ out_free:
+ 	/* Free whatever "new" acquired that "old" does not share. */
+ 	kvm_free_memslot(kvm, &new, &old);
+ out:
+ 	return r;
+ }
+ EXPORT_SYMBOL_GPL(__kvm_set_memory_region);
+ 
+ /*
+  * Locked wrapper: run __kvm_set_memory_region() with kvm->slots_lock held
+  * and pass its result back to the caller.
+  */
+ int kvm_set_memory_region(struct kvm *kvm,
+ 			  const struct kvm_userspace_memory_region *mem)
+ {
+ 	int ret;
+ 
+ 	mutex_lock(&kvm->slots_lock);
+ 	ret = __kvm_set_memory_region(kvm, mem);
+ 	mutex_unlock(&kvm->slots_lock);
+ 
+ 	return ret;
+ }
+ EXPORT_SYMBOL_GPL(kvm_set_memory_region);
+ 
+ /*
+  * Ioctl-side entry point: accept only slot ids below KVM_USER_MEM_SLOTS
+  * (taken from the low 16 bits of @mem->slot) and forward the request to
+  * kvm_set_memory_region().  Any other slot id yields -EINVAL.
+  */
+ static int kvm_vm_ioctl_set_memory_region(struct kvm *kvm,
+ 					  struct kvm_userspace_memory_region *mem)
+ {
+ 	u16 slot_id = (u16)mem->slot;
+ 
+ 	if (slot_id < KVM_USER_MEM_SLOTS)
+ 		return kvm_set_memory_region(kvm, mem);
+ 
+ 	return -EINVAL;
+ }
+ 
+ /*
+  * Copy the dirty bitmap of the memslot selected by @log->slot to userspace.
+  *
+  * @log->slot carries the address-space id in its upper 16 bits and the slot
+  * id in its lower 16 bits.  *@is_dirty is set to 1 when at least one bitmap
+  * word is non-zero and the copy succeeds; otherwise it is left untouched.
+  *
+  * Returns 0 on success, -EINVAL for an out-of-range slot, -ENOENT if the
+  * slot has no dirty bitmap, or -EFAULT if the copy to userspace fails.
+  */
+ int kvm_get_dirty_log(struct kvm *kvm,
+ 			struct kvm_dirty_log *log, int *is_dirty)
+ {
+ 	struct kvm_memory_slot *memslot;
+ 	unsigned long nbytes;
+ 	unsigned long dirty = 0;
+ 	int as_id = log->slot >> 16;
+ 	int id = (u16)log->slot;
+ 	int word;
+ 
+ 	if (as_id >= KVM_ADDRESS_SPACE_NUM || id >= KVM_USER_MEM_SLOTS)
+ 		return -EINVAL;
+ 
+ 	memslot = id_to_memslot(__kvm_memslots(kvm, as_id), id);
+ 	if (!memslot->dirty_bitmap)
+ 		return -ENOENT;
+ 
+ 	nbytes = kvm_dirty_bitmap_bytes(memslot);
+ 
+ 	/* Scanning stops at the first non-zero word. */
+ 	for (word = 0; !dirty && word < nbytes / sizeof(long); word++)
+ 		dirty = memslot->dirty_bitmap[word];
+ 
+ 	if (copy_to_user(log->dirty_bitmap, memslot->dirty_bitmap, nbytes))
+ 		return -EFAULT;
+ 
+ 	if (dirty)
+ 		*is_dirty = 1;
+ 
+ 	return 0;
+ }
+ EXPORT_SYMBOL_GPL(kvm_get_dirty_log);
+ 
+ #ifdef CONFIG_KVM_GENERIC_DIRTYLOG_READ_PROTECT
+ /**
+  * kvm_get_dirty_log_protect - get a snapshot of dirty pages
+  *	and reenable dirty page tracking for the corresponding pages.
+  * @kvm:	pointer to kvm instance
+  * @log:	slot id and address to which we copy the log
+  * @flush:	true if TLB flush is needed by caller
+  *
+  * We need to keep it in mind that VCPU threads can write to the bitmap
+  * concurrently. So, to avoid losing track of dirty pages we keep the
+  * following order:
+  *
+  *    1. Take a snapshot of the bit and clear it if needed.
+  *    2. Write protect the corresponding page.
+  *    3. Copy the snapshot to the userspace.
+  *    4. Upon return caller flushes TLB's if needed.
+  *
+  * Between 2 and 4, the guest may write to the page using the remaining TLB
+  * entry.  This is not a problem because the page is reported dirty using
+  * the snapshot taken before and step 4 ensures that writes done after
+  * exiting to userspace will be logged for the next call.
+  *
+  * Return: 0 on success, -EINVAL for an out-of-range slot, -ENOENT if the
+  * slot has no dirty bitmap, -EFAULT if the copy to userspace fails.
+  */
+ int kvm_get_dirty_log_protect(struct kvm *kvm,
+ 			struct kvm_dirty_log *log, bool *flush)
+ {
+ 	struct kvm_memslots *slots;
+ 	struct kvm_memory_slot *memslot;
+ 	int i, as_id, id;
+ 	unsigned long n;
+ 	unsigned long *dirty_bitmap;
+ 	unsigned long *dirty_bitmap_buffer;
+ 
+ 	/* Upper 16 bits: address-space id; lower 16 bits: slot id. */
+ 	as_id = log->slot >> 16;
+ 	id = (u16)log->slot;
+ 	if (as_id >= KVM_ADDRESS_SPACE_NUM || id >= KVM_USER_MEM_SLOTS)
+ 		return -EINVAL;
+ 
+ 	slots = __kvm_memslots(kvm, as_id);
+ 	memslot = id_to_memslot(slots, id);
+ 
+ 	dirty_bitmap = memslot->dirty_bitmap;
+ 	if (!dirty_bitmap)
+ 		return -ENOENT;
+ 
+ 	n = kvm_dirty_bitmap_bytes(memslot);
+ 	*flush = false;
+ 	if (kvm->manual_dirty_log_protect) {
+ 		/*
+ 		 * Unlike kvm_get_dirty_log, we always return false in *flush,
+ 		 * because no flush is needed until KVM_CLEAR_DIRTY_LOG.  There
+ 		 * is some code duplication between this function and
+ 		 * kvm_get_dirty_log, but hopefully all architecture
+ 		 * transition to kvm_get_dirty_log_protect and kvm_get_dirty_log
+ 		 * can be eliminated.
+ 		 */
+ 		dirty_bitmap_buffer = dirty_bitmap;
+ 	} else {
+ 		/* Snapshot into the slot's second bitmap, zeroed first. */
+ 		dirty_bitmap_buffer = kvm_second_dirty_bitmap(memslot);
+ 		memset(dirty_bitmap_buffer, 0, n);
+ 
+ 		spin_lock(&kvm->mmu_lock);
+ 		for (i = 0; i < n / sizeof(long); i++) {
+ 			unsigned long mask;
+ 			gfn_t offset;
+ 
+ 			if (!dirty_bitmap[i])
+ 				continue;
+ 
+ 			*flush = true;
+ 			/* Fetch and clear the word in one atomic step. */
+ 			mask = xchg(&dirty_bitmap[i], 0);
+ 			dirty_bitmap_buffer[i] = mask;
+ 
+ 			offset = i * BITS_PER_LONG;
+ 			kvm_arch_mmu_enable_log_dirty_pt_masked(kvm, memslot,
+ 								offset, mask);
+ 		}
+ 		spin_unlock(&kvm->mmu_lock);
+ 	}
+ 
+ 	if (copy_to_user(log->dirty_bitmap, dirty_bitmap_buffer, n))
+ 		return -EFAULT;
+ 	return 0;
+ }
+ EXPORT_SYMBOL_GPL(kvm_get_dirty_log_protect);
+ 
+ /**
+  * kvm_clear_dirty_log_protect - clear dirty bits in the bitmap
+  *	and reenable dirty page tracking for the corresponding pages.
+  * @kvm:	pointer to kvm instance
+  * @log:	slot id and address from which to fetch the bitmap of dirty pages
+  * @flush:	true if TLB flush is needed by caller
+  *
+  * Return: 0 on success, -EINVAL for an out-of-range slot or an invalid
+  * first_page/num_pages combination, -ENOENT if the slot has no dirty bitmap,
+  * -EFAULT if the bitmap cannot be copied from userspace.
+  */
+ int kvm_clear_dirty_log_protect(struct kvm *kvm,
+ 				struct kvm_clear_dirty_log *log, bool *flush)
+ {
+ 	struct kvm_memslots *slots;
+ 	struct kvm_memory_slot *memslot;
+ 	int as_id, id;
+ 	gfn_t offset;
+ 	unsigned long i, n;
+ 	unsigned long *dirty_bitmap;
+ 	unsigned long *dirty_bitmap_buffer;
+ 
+ 	/* Upper 16 bits: address-space id; lower 16 bits: slot id. */
+ 	as_id = log->slot >> 16;
+ 	id = (u16)log->slot;
+ 	if (as_id >= KVM_ADDRESS_SPACE_NUM || id >= KVM_USER_MEM_SLOTS)
+ 		return -EINVAL;
+ 
+ 	/* first_page must be a multiple of 64. */
+ 	if (log->first_page & 63)
+ 		return -EINVAL;
+ 
+ 	slots = __kvm_memslots(kvm, as_id);
+ 	memslot = id_to_memslot(slots, id);
+ 
+ 	dirty_bitmap = memslot->dirty_bitmap;
+ 	if (!dirty_bitmap)
+ 		return -ENOENT;
+ 
+ 	/* Bytes of user bitmap needed to cover num_pages, in whole longs. */
+ 	n = ALIGN(log->num_pages, BITS_PER_LONG) / 8;
+ 
+ 	/*
+ 	 * The range must lie inside the memslot, and num_pages must be a
+ 	 * multiple of 64 unless the range runs up to the end of the memslot.
+ 	 */
+ 	if (log->first_page > memslot->npages ||
+ 	    log->num_pages > memslot->npages - log->first_page ||
+ 	    (log->num_pages < memslot->npages - log->first_page && (log->num_pages & 63)))
+ 	    return -EINVAL;
+ 
+ 	*flush = false;
+ 	dirty_bitmap_buffer = kvm_second_dirty_bitmap(memslot);
+ 	if (copy_from_user(dirty_bitmap_buffer, log->dirty_bitmap, n))
+ 		return -EFAULT;
+ 
+ 	spin_lock(&kvm->mmu_lock);
+ 	/* From here on n counts the bitmap words still to be processed. */
+ 	for (offset = log->first_page, i = offset / BITS_PER_LONG,
+ 		 n = DIV_ROUND_UP(log->num_pages, BITS_PER_LONG); n--;
+ 	     i++, offset += BITS_PER_LONG) {
+ 		unsigned long mask = *dirty_bitmap_buffer++;
+ 		atomic_long_t *p = (atomic_long_t *) &dirty_bitmap[i];
+ 		if (!mask)
+ 			continue;
+ 
+ 		/* Clear the requested bits; keep those that were really set. */
+ 		mask &= atomic_long_fetch_andnot(mask, p);
+ 
+ 		/*
+ 		 * mask contains the bits that really have been cleared.  This
+ 		 * never includes any bits beyond the length of the memslot (if
+ 		 * the length is not aligned to 64 pages), therefore it is not
+ 		 * a problem if userspace sets them in log->dirty_bitmap.
+ 		*/
+ 		if (mask) {
+ 			*flush = true;
+ 			kvm_arch_mmu_enable_log_dirty_pt_masked(kvm, memslot,
+ 								offset, mask);
+ 		}
+ 	}
+ 	spin_unlock(&kvm->mmu_lock);
+ 
+ 	return 0;
+ }
+ EXPORT_SYMBOL_GPL(kvm_clear_dirty_log_protect);
+ #endif
+ 
+ /* Report the current value of the file-level largepages_enabled flag. */
+ bool kvm_largepages_enabled(void)
+ {
+ 	return largepages_enabled;
+ }
+ 
+ /*
+  * Clear largepages_enabled, so that kvm_largepages_enabled() returns false
+  * from now on.
+  */
+ void kvm_disable_largepages(void)
+ {
+ 	largepages_enabled = false;
+ }
+ EXPORT_SYMBOL_GPL(kvm_disable_largepages);
+ 
+ /* Look up @gfn in the memslots returned by kvm_memslots(@kvm). */
+ struct kvm_memory_slot *gfn_to_memslot(struct kvm *kvm, gfn_t gfn)
+ {
+ 	struct kvm_memslots *slots = kvm_memslots(kvm);
+ 
+ 	return __gfn_to_memslot(slots, gfn);
+ }
+ EXPORT_SYMBOL_GPL(gfn_to_memslot);
+ 
+ /* Look up @gfn in the memslots returned by kvm_vcpu_memslots(@vcpu). */
+ struct kvm_memory_slot *kvm_vcpu_gfn_to_memslot(struct kvm_vcpu *vcpu, gfn_t gfn)
+ {
+ 	struct kvm_memslots *slots = kvm_vcpu_memslots(vcpu);
+ 
+ 	return __gfn_to_memslot(slots, gfn);
+ }
+ 
+ /*
+  * Return true if @gfn is covered by a memslot whose id is below
+  * KVM_USER_MEM_SLOTS and which is not flagged KVM_MEMSLOT_INVALID.
+  */
+ bool kvm_is_visible_gfn(struct kvm *kvm, gfn_t gfn)
+ {
+ 	struct kvm_memory_slot *slot = gfn_to_memslot(kvm, gfn);
+ 
+ 	if (!slot)
+ 		return false;
+ 	if (slot->id >= KVM_USER_MEM_SLOTS)
+ 		return false;
+ 
+ 	return !(slot->flags & KVM_MEMSLOT_INVALID);
+ }
+ EXPORT_SYMBOL_GPL(kvm_is_visible_gfn);
+ 
+ /*
+  * Return the kernel page size of the host VMA found for @gfn.
+  *
+  * PAGE_SIZE is returned when @gfn does not translate to a valid host virtual
+  * address or when find_vma() yields no VMA for it.  The lookup is done with
+  * the current mm's mmap_sem held for reading.
+  */
+ unsigned long kvm_host_page_size(struct kvm_vcpu *vcpu, gfn_t gfn)
+ {
+ 	unsigned long hva, page_size = PAGE_SIZE;
+ 	struct vm_area_struct *vma;
+ 
+ 	hva = kvm_vcpu_gfn_to_hva_prot(vcpu, gfn, NULL);
+ 	if (kvm_is_error_hva(hva))
+ 		return PAGE_SIZE;
+ 
+ 	down_read(&current->mm->mmap_sem);
+ 	vma = find_vma(current->mm, hva);
+ 	if (vma)
+ 		page_size = vma_kernel_pagesize(vma);
+ 	up_read(&current->mm->mmap_sem);
+ 
+ 	return page_size;
+ }
+ 
+ /* True when @slot has the KVM_MEM_READONLY flag set. */
+ static bool memslot_is_readonly(struct kvm_memory_slot *slot)
+ {
+ 	return (slot->flags & KVM_MEM_READONLY) != 0;
+ }
+ 
+ /*
+  * Translate @gfn to a host virtual address within @slot.
+  *
+  * Returns KVM_HVA_ERR_BAD if @slot is NULL or flagged KVM_MEMSLOT_INVALID,
+  * and KVM_HVA_ERR_RO_BAD if @write is requested on a read-only slot.  When
+  * @nr_pages is non-NULL it receives the number of pages from @gfn to the
+  * end of the slot.
+  */
+ static unsigned long __gfn_to_hva_many(struct kvm_memory_slot *slot, gfn_t gfn,
+ 				       gfn_t *nr_pages, bool write)
+ {
+ 	if (!slot)
+ 		return KVM_HVA_ERR_BAD;
+ 	if (slot->flags & KVM_MEMSLOT_INVALID)
+ 		return KVM_HVA_ERR_BAD;
+ 
+ 	if (write && memslot_is_readonly(slot))
+ 		return KVM_HVA_ERR_RO_BAD;
+ 
+ 	if (nr_pages)
+ 		*nr_pages = slot->npages - (gfn - slot->base_gfn);
+ 
+ 	return __gfn_to_hva_memslot(slot, gfn);
+ }
+ 
+ /*
+  * Write-access variant of __gfn_to_hva_many(): always passes write == true,
+  * so a read-only slot yields KVM_HVA_ERR_RO_BAD.
+  */
+ static unsigned long gfn_to_hva_many(struct kvm_memory_slot *slot, gfn_t gfn,
+ 				     gfn_t *nr_pages)
+ {
+ 	return __gfn_to_hva_many(slot, gfn, nr_pages, true);
+ }
+ 
+ /*
+  * Translate @gfn within @slot to a host virtual address for write access,
+  * without reporting the number of remaining pages.
+  */
+ unsigned long gfn_to_hva_memslot(struct kvm_memory_slot *slot,
+ 					gfn_t gfn)
+ {
+ 	return gfn_to_hva_many(slot, gfn, NULL);
+ }
+ EXPORT_SYMBOL_GPL(gfn_to_hva_memslot);
+ 
+ /*
+  * Find the memslot of @kvm that contains @gfn and translate @gfn to a host
+  * virtual address for write access.
+  */
+ unsigned long gfn_to_hva(struct kvm *kvm, gfn_t gfn)
+ {
+ 	struct kvm_memory_slot *slot = gfn_to_memslot(kvm, gfn);
+ 
+ 	return gfn_to_hva_many(slot, gfn, NULL);
+ }
+ EXPORT_SYMBOL_GPL(gfn_to_hva);
+ 
+ /*
+  * Per-vCPU counterpart of gfn_to_hva(): the memslot is looked up through
+  * kvm_vcpu_gfn_to_memslot().
+  */
+ unsigned long kvm_vcpu_gfn_to_hva(struct kvm_vcpu *vcpu, gfn_t gfn)
+ {
+ 	struct kvm_memory_slot *slot = kvm_vcpu_gfn_to_memslot(vcpu, gfn);
+ 
+ 	return gfn_to_hva_many(slot, gfn, NULL);
+ }
+ EXPORT_SYMBOL_GPL(kvm_vcpu_gfn_to_hva);
+ 
+ /*
+  * Translate @gfn to a host virtual address for read access and report
+  * whether the slot is writable.
+  *
+  * @slot: the kvm_memory_slot which contains @gfn
+  * @gfn: the gfn to be translated
+  * @writable: if not NULL and the translation succeeds, set to true when
+  * @slot is not read-only and to false when it is
+  *
+  * Returns the hva, which may be an error hva; *@writable is not written in
+  * that case.
+  */
+ unsigned long gfn_to_hva_memslot_prot(struct kvm_memory_slot *slot,
+ 				      gfn_t gfn, bool *writable)
+ {
+ 	unsigned long hva = __gfn_to_hva_many(slot, gfn, NULL, false);
+ 
+ 	if (writable && !kvm_is_error_hva(hva))
+ 		*writable = !memslot_is_readonly(slot);
+ 
+ 	return hva;
+ }
+ 
+ /*
+  * Like gfn_to_hva_memslot_prot(), with the memslot looked up in @kvm via
+  * gfn_to_memslot().
+  */
+ unsigned long gfn_to_hva_prot(struct kvm *kvm, gfn_t gfn, bool *writable)
+ {
+ 	return gfn_to_hva_memslot_prot(gfn_to_memslot(kvm, gfn), gfn, writable);
+ }
+ 
+ /*
+  * Like gfn_to_hva_memslot_prot(), with the memslot looked up via
+  * kvm_vcpu_gfn_to_memslot().
+  */
+ unsigned long kvm_vcpu_gfn_to_hva_prot(struct kvm_vcpu *vcpu, gfn_t gfn, bool *writable)
+ {
+ 	return gfn_to_hva_memslot_prot(kvm_vcpu_gfn_to_memslot(vcpu, gfn),
+ 				       gfn, writable);
+ }
+ 
+ /*
+  * Return non-zero if a get_user_pages() lookup of @addr with FOLL_HWPOISON
+  * and FOLL_WRITE fails with -EHWPOISON, zero otherwise.
+  */
+ static inline int check_user_page_hwpoison(unsigned long addr)
+ {
+ 	int rc = get_user_pages(addr, 1, FOLL_HWPOISON | FOLL_WRITE, NULL, NULL);
+ 
+ 	return rc == -EHWPOISON;
+ }
+ 
+ /*
+  * The fast path to get the writable pfn, which is stored in @pfn.  Returns
+  * true on success and false otherwise.  It is also the only part that runs
+  * if we are in atomic context.
+  */
+ static bool hva_to_pfn_fast(unsigned long addr, bool write_fault,
+ 			    bool *writable, kvm_pfn_t *pfn)
+ {
+ 	struct page *page;
+ 
+ 	/*
+ 	 * A writable pfn is only fast-pinned for a write fault, or for a
+ 	 * read fault whose caller accepts a writable mapping.
+ 	 */
+ 	if (!write_fault && !writable)
+ 		return false;
+ 
+ 	if (__get_user_pages_fast(addr, 1, 1, &page) != 1)
+ 		return false;
+ 
+ 	*pfn = page_to_pfn(page);
+ 	if (writable)
+ 		*writable = true;
+ 
+ 	return true;
+ }
+ 
+ /*
+  * The slow path to get the pfn of the specified host virtual address; it
+  * may sleep.
+  *
+  * Returns 1 with *@pfn filled in on success; any other result of
+  * get_user_pages_unlocked() is passed back unchanged.  If @writable is
+  * non-NULL it reports whether the page was obtained writable.
+  */
+ static int hva_to_pfn_slow(unsigned long addr, bool *async, bool write_fault,
+ 			   bool *writable, kvm_pfn_t *pfn)
+ {
+ 	unsigned int gup_flags = FOLL_HWPOISON;
+ 	struct page *page;
+ 	int got;
+ 
+ 	might_sleep();
+ 
+ 	if (writable)
+ 		*writable = write_fault;
+ 
+ 	if (write_fault)
+ 		gup_flags |= FOLL_WRITE;
+ 	if (async)
+ 		gup_flags |= FOLL_NOWAIT;
+ 
+ 	got = get_user_pages_unlocked(addr, 1, &page, gup_flags);
+ 	if (got != 1)
+ 		return got;
+ 
+ 	/*
+ 	 * For a read fault, try the fast writable lookup as well; if it
+ 	 * succeeds, hand out that page and drop the one obtained above.
+ 	 */
+ 	if (unlikely(!write_fault) && writable) {
+ 		struct page *wpage;
+ 
+ 		if (__get_user_pages_fast(addr, 1, 1, &wpage) == 1) {
+ 			put_page(page);
+ 			page = wpage;
+ 			*writable = true;
+ 		}
+ 	}
+ 
+ 	*pfn = page_to_pfn(page);
+ 	return got;
+ }
+ 
+ /*
+  * True if @vma allows the access: VM_READ is always required, and VM_WRITE
+  * as well when @write_fault is set.
+  */
+ static bool vma_is_valid(struct vm_area_struct *vma, bool write_fault)
+ {
+ 	unsigned long needed = VM_READ;
+ 
+ 	if (write_fault)
+ 		needed |= VM_WRITE;
+ 
+ 	return (vma->vm_flags & needed) == needed;
+ }
+ 
+ /*
+  * Take a reference on @pfn on behalf of hva_to_pfn_remapped().
+  *
+  * Reserved pfns are not reference counted, so they always succeed.  For any
+  * other pfn the reference is taken only if the page's refcount is not
+  * already zero; a zero refcount means the page must not be handed out.
+  *
+  * Returns non-zero on success and 0 if no reference could be taken.
+  */
+ static int kvm_try_get_pfn(kvm_pfn_t pfn)
+ {
+ 	if (kvm_is_reserved_pfn(pfn))
+ 		return 1;
+ 	return get_page_unless_zero(pfn_to_page(pfn));
+ }
+ 
+ /*
+  * Resolve @addr inside a VM_IO/VM_PFNMAP @vma by reading its pte directly.
+  *
+  * On return *@p_pfn holds the pfn, or KVM_PFN_ERR_RO_FAULT when a write
+  * fault hits a pte that is not writable.  If @writable is non-NULL it
+  * receives the pte's write permission.
+  *
+  * Returns 0 on success, -EAGAIN if fixup_user_fault() dropped the lock and
+  * the caller must redo the VMA lookup, -EFAULT if no reference could be
+  * taken on the page, or the error from follow_pte()/fixup_user_fault().
+  */
+ static int hva_to_pfn_remapped(struct vm_area_struct *vma,
+ 			       unsigned long addr, bool *async,
+ 			       bool write_fault, bool *writable,
+ 			       kvm_pfn_t *p_pfn)
+ {
+ 	kvm_pfn_t pfn;
+ 	pte_t *ptep;
+ 	spinlock_t *ptl;
+ 	int r;
+ 
+ 	r = follow_pte(vma->vm_mm, addr, &ptep, &ptl);
+ 	if (r) {
+ 		/*
+ 		 * get_user_pages fails for VM_IO and VM_PFNMAP vmas and does
+ 		 * not call the fault handler, so do it here.
+ 		 */
+ 		bool unlocked = false;
+ 		r = fixup_user_fault(current, current->mm, addr,
+ 				     (write_fault ? FAULT_FLAG_WRITE : 0),
+ 				     &unlocked);
+ 		if (unlocked)
+ 			return -EAGAIN;
+ 		if (r)
+ 			return r;
+ 
+ 		r = follow_pte(vma->vm_mm, addr, &ptep, &ptl);
+ 		if (r)
+ 			return r;
+ 	}
+ 
+ 	/* r is 0 from here on unless the reference below cannot be taken. */
+ 	if (write_fault && !pte_write(*ptep)) {
+ 		pfn = KVM_PFN_ERR_RO_FAULT;
+ 		goto out;
+ 	}
+ 
+ 	if (writable)
+ 		*writable = pte_write(*ptep);
+ 	pfn = pte_pfn(*ptep);
+ 
+ 	/*
+ 	 * Get a reference here because callers of *hva_to_pfn* and
+ 	 * *gfn_to_pfn* ultimately call kvm_release_pfn_clean on the
+ 	 * returned pfn.  This is only needed if the VMA has VM_MIXEDMAP
+ 	 * set, but the reference helpers simply do nothing for reserved
+ 	 * pfns.
+ 	 *
+ 	 * Whoever called remap_pfn_range is also going to call e.g.
+ 	 * unmap_mapping_range before the underlying pages are freed,
+ 	 * causing a call to our MMU notifier.
+ 	 *
+ 	 * Certain IO or PFNMAP mappings can be backed with valid
+ 	 * struct pages, but be allocated without refcounting e.g.,
+ 	 * tail pages of non-compound higher order allocations, which
+ 	 * would then underflow the refcount when the caller does the
+ 	 * required put_page.  Don't allow those pages here.
+ 	 */
+ 	if (!kvm_try_get_pfn(pfn))
+ 		r = -EFAULT;
+ 
+ out:
+ 	pte_unmap_unlock(ptep, ptl);
+ 	*p_pfn = pfn;
+ 
+ 	return r;
+ }
+ 
+ /*
+  * Pin guest page in memory and return its pfn.
+  * @addr: host virtual address which maps memory to the guest
+  * @atomic: whether this function can sleep
+  * @async: whether this function need to wait IO complete if the
+  *         host page is not in the memory
+  * @write_fault: whether we should get a writable host page
+  * @writable: whether it allows to map a writable host page for !@write_fault
+  *
+  * The function will map a writable host page for these two cases:
+  * 1): @write_fault = true
+  * 2): @write_fault = false && @writable, @writable will tell the caller
+  *     whether the mapping is writable.
+  *
+  * Returns the pfn on success, or one of KVM_PFN_ERR_FAULT,
+  * KVM_PFN_ERR_HWPOISON or KVM_PFN_ERR_RO_FAULT on failure.
+  */
+ static kvm_pfn_t hva_to_pfn(unsigned long addr, bool atomic, bool *async,
+ 			bool write_fault, bool *writable)
+ {
+ 	struct vm_area_struct *vma;
+ 	kvm_pfn_t pfn = 0;
+ 	int npages, r;
+ 
+ 	/* we can do it either atomically or asynchronously, not both */
+ 	BUG_ON(atomic && async);
+ 
+ 	if (hva_to_pfn_fast(addr, write_fault, writable, &pfn))
+ 		return pfn;
+ 
+ 	/* Everything past the fast path may sleep. */
+ 	if (atomic)
+ 		return KVM_PFN_ERR_FAULT;
+ 
+ 	npages = hva_to_pfn_slow(addr, async, write_fault, writable, &pfn);
+ 	if (npages == 1)
+ 		return pfn;
+ 
+ 	/* Both lookups failed: inspect the VMA under mmap_sem to see why. */
+ 	down_read(&current->mm->mmap_sem);
+ 	if (npages == -EHWPOISON ||
+ 	      (!async && check_user_page_hwpoison(addr))) {
+ 		pfn = KVM_PFN_ERR_HWPOISON;
+ 		goto exit;
+ 	}
+ 
+ retry:
+ 	vma = find_vma_intersection(current->mm, addr, addr + 1);
+ 
+ 	if (vma == NULL)
+ 		pfn = KVM_PFN_ERR_FAULT;
+ 	else if (vma->vm_flags & (VM_IO | VM_PFNMAP)) {
+ 		r = hva_to_pfn_remapped(vma, addr, async, write_fault, writable, &pfn);
+ 		/* -EAGAIN: the VMA must be looked up again. */
+ 		if (r == -EAGAIN)
+ 			goto retry;
+ 		if (r < 0)
+ 			pfn = KVM_PFN_ERR_FAULT;
+ 	} else {
+ 		/*
+ 		 * Still a fault, but *async is set when the VMA itself
+ 		 * permits the requested access.
+ 		 */
+ 		if (async && vma_is_valid(vma, write_fault))
+ 			*async = true;
+ 		pfn = KVM_PFN_ERR_FAULT;
+ 	}
+ exit:
+ 	up_read(&current->mm->mmap_sem);
+ 	return pfn;
+ }
+ 
+ /*
+  * Translate @gfn within @slot to a host pfn.
+  *
+  * Returns KVM_PFN_ERR_RO_FAULT for a write fault on a read-only slot and
+  * KVM_PFN_NOSLOT when @gfn has no valid host address; in both cases
+  * *@writable, if given, is set to false.  Otherwise the result of
+  * hva_to_pfn() is returned; @atomic, @async and @write_fault are passed
+  * through to it unchanged.
+  */
+ kvm_pfn_t __gfn_to_pfn_memslot(struct kvm_memory_slot *slot, gfn_t gfn,
+ 			       bool atomic, bool *async, bool write_fault,
+ 			       bool *writable)
+ {
+ 	unsigned long hva = __gfn_to_hva_many(slot, gfn, NULL, write_fault);
+ 	bool ro_fault = (hva == KVM_HVA_ERR_RO_BAD);
+ 
+ 	if (ro_fault || kvm_is_error_hva(hva)) {
+ 		if (writable)
+ 			*writable = false;
+ 		return ro_fault ? KVM_PFN_ERR_RO_FAULT : KVM_PFN_NOSLOT;
+ 	}
+ 
+ 	/* A read-only memslot must never yield a writable pfn. */
+ 	if (writable && memslot_is_readonly(slot)) {
+ 		*writable = false;
+ 		writable = NULL;
+ 	}
+ 
+ 	return hva_to_pfn(hva, atomic, async, write_fault, writable);
+ }
+ EXPORT_SYMBOL_GPL(__gfn_to_pfn_memslot);
+ 
+ /*
+  * Non-atomic, synchronous gfn-to-pfn translation in @kvm, with the caller
+  * choosing @write_fault and optionally receiving the result in @writable.
+  */
+ kvm_pfn_t gfn_to_pfn_prot(struct kvm *kvm, gfn_t gfn, bool write_fault,
+ 		      bool *writable)
+ {
+ 	struct kvm_memory_slot *slot = gfn_to_memslot(kvm, gfn);
+ 
+ 	return __gfn_to_pfn_memslot(slot, gfn, false, NULL, write_fault,
+ 				    writable);
+ }
+ EXPORT_SYMBOL_GPL(gfn_to_pfn_prot);
+ 
+ /*
+  * Translate @gfn within @slot as a write fault: not atomic, no @async and
+  * no @writable out-parameter.
+  */
+ kvm_pfn_t gfn_to_pfn_memslot(struct kvm_memory_slot *slot, gfn_t gfn)
+ {
+ 	return __gfn_to_pfn_memslot(slot, gfn, false, NULL, true, NULL);
+ }
+ EXPORT_SYMBOL_GPL(gfn_to_pfn_memslot);
+ 
+ /*
+  * Atomic variant of gfn_to_pfn_memslot(): same write-fault translation, but
+  * with atomic == true.
+  */
+ kvm_pfn_t gfn_to_pfn_memslot_atomic(struct kvm_memory_slot *slot, gfn_t gfn)
+ {
+ 	return __gfn_to_pfn_memslot(slot, gfn, true, NULL, true, NULL);
+ }
+ EXPORT_SYMBOL_GPL(gfn_to_pfn_memslot_atomic);
+ 
+ /* Resolve @gfn through @kvm's memslots and translate it atomically. */
+ kvm_pfn_t gfn_to_pfn_atomic(struct kvm *kvm, gfn_t gfn)
+ {
+ 	struct kvm_memory_slot *slot = gfn_to_memslot(kvm, gfn);
+ 
+ 	return gfn_to_pfn_memslot_atomic(slot, gfn);
+ }
+ EXPORT_SYMBOL_GPL(gfn_to_pfn_atomic);
+ 
+ /* Resolve @gfn through @vcpu's memslot view and translate it atomically. */
+ kvm_pfn_t kvm_vcpu_gfn_to_pfn_atomic(struct kvm_vcpu *vcpu, gfn_t gfn)
+ {
+ 	struct kvm_memory_slot *slot = kvm_vcpu_gfn_to_memslot(vcpu, gfn);
+ 
+ 	return gfn_to_pfn_memslot_atomic(slot, gfn);
+ }
+ EXPORT_SYMBOL_GPL(kvm_vcpu_gfn_to_pfn_atomic);
+ 
+ /* Resolve @gfn through @kvm's memslots and translate it to a pfn. */
+ kvm_pfn_t gfn_to_pfn(struct kvm *kvm, gfn_t gfn)
+ {
+ 	struct kvm_memory_slot *slot = gfn_to_memslot(kvm, gfn);
+ 
+ 	return gfn_to_pfn_memslot(slot, gfn);
+ }
+ EXPORT_SYMBOL_GPL(gfn_to_pfn);
+ 
+ /* Resolve @gfn through @vcpu's memslot view and translate it to a pfn. */
+ kvm_pfn_t kvm_vcpu_gfn_to_pfn(struct kvm_vcpu *vcpu, gfn_t gfn)
+ {
+ 	struct kvm_memory_slot *slot = kvm_vcpu_gfn_to_memslot(vcpu, gfn);
+ 
+ 	return gfn_to_pfn_memslot(slot, gfn);
+ }
+ EXPORT_SYMBOL_GPL(kvm_vcpu_gfn_to_pfn);
+ 
+ /*
+  * Pin up to @nr_pages pages starting at @gfn in @slot into @pages.
+  *
+  * Returns -1 if the gfn has no valid host virtual address, 0 if the page
+  * count reported by gfn_to_hva_many() is smaller than @nr_pages, and
+  * otherwise whatever __get_user_pages_fast() returns.
+  */
+ int gfn_to_page_many_atomic(struct kvm_memory_slot *slot, gfn_t gfn,
+ 			    struct page **pages, int nr_pages)
+ {
+ 	gfn_t avail = 0;
+ 	unsigned long hva = gfn_to_hva_many(slot, gfn, &avail);
+ 
+ 	if (kvm_is_error_hva(hva))
+ 		return -1;
+ 
+ 	if (avail < nr_pages)
+ 		return 0;
+ 
+ 	return __get_user_pages_fast(hva, nr_pages, 1, pages);
+ }
+ EXPORT_SYMBOL_GPL(gfn_to_page_many_atomic);
+ 
+ /*
+  * Convert @pfn to its struct page.  Error/no-slot pfns and reserved pfns
+  * yield KVM_ERR_PTR_BAD_PAGE; a reserved pfn additionally triggers a
+  * warning.
+  */
+ static struct page *kvm_pfn_to_page(kvm_pfn_t pfn)
+ {
+ 	if (is_error_noslot_pfn(pfn))
+ 		return KVM_ERR_PTR_BAD_PAGE;
+ 
+ 	if (WARN_ON(kvm_is_reserved_pfn(pfn)))
+ 		return KVM_ERR_PTR_BAD_PAGE;
+ 
+ 	return pfn_to_page(pfn);
+ }
+ 
+ /* Translate @gfn in @kvm to a pfn and return the matching struct page. */
+ struct page *gfn_to_page(struct kvm *kvm, gfn_t gfn)
+ {
+ 	return kvm_pfn_to_page(gfn_to_pfn(kvm, gfn));
+ }
+ EXPORT_SYMBOL_GPL(gfn_to_page);
+ 
+ /*
+  * Release @pfn, marking it dirty first when @dirty is set.  A pfn of 0 is
+  * ignored.  When @cache is given, its recorded gfn and pfn are reset to 0
+  * before the release.
+  */
+ void kvm_release_pfn(kvm_pfn_t pfn, bool dirty, struct gfn_to_pfn_cache *cache)
+ {
+ 	if (!pfn)
+ 		return;
+ 
+ 	if (cache) {
+ 		cache->gfn = 0;
+ 		cache->pfn = 0;
+ 	}
+ 
+ 	if (!dirty) {
+ 		kvm_release_pfn_clean(pfn);
+ 		return;
+ 	}
+ 
+ 	kvm_release_pfn_dirty(pfn);
+ }
+ 
+ /*
+  * Refill @cache for @gfn: drop the pfn it currently records, look the gfn
+  * up again in @slot and stamp the entry with generation @gen.
+  */
+ static void kvm_cache_gfn_to_pfn(struct kvm_memory_slot *slot, gfn_t gfn,
+ 				 struct gfn_to_pfn_cache *cache, u64 gen)
+ {
+ 	/* The old entry has to be released before it is overwritten. */
+ 	kvm_release_pfn(cache->pfn, cache->dirty, cache);
+ 
+ 	cache->pfn = gfn_to_pfn_memslot(slot, gfn);
+ 	cache->generation = gen;
+ 	cache->dirty = false;
+ 	cache->gfn = gfn;
+ }
+ 
+ /*
+  * Map guest frame @gfn into the host kernel address space and describe
+  * the mapping in @map.
+  *
+  * With a @cache, the cached pfn is reused while it is non-zero and its
+  * gfn and memslot generation still match; otherwise the cache is refilled,
+  * which is refused with -EAGAIN in atomic mode.  Without a cache the pfn
+  * is always looked up, so atomic mode always yields -EAGAIN.
+  *
+  * Returns 0 on success; -EINVAL for a NULL @map, an error/no-slot pfn or
+  * (with CONFIG_HAS_IOMEM) an atomic request for a pfn that fails
+  * pfn_valid(); -EAGAIN as described above; -EFAULT when no host address
+  * was obtained.
+  */
+ static int __kvm_map_gfn(struct kvm_memslots *slots, gfn_t gfn,
+ 			 struct kvm_host_map *map,
+ 			 struct gfn_to_pfn_cache *cache,
+ 			 bool atomic)
+ {
+ 	kvm_pfn_t pfn;
+ 	void *hva = NULL;
+ 	struct page *page = KVM_UNMAPPED_PAGE;
+ 	struct kvm_memory_slot *slot = __gfn_to_memslot(slots, gfn);
+ 	u64 gen = slots->generation;
+ 
+ 	if (!map)
+ 		return -EINVAL;
+ 
+ 	if (cache) {
+ 		/* Stale or empty cache entry: refill it unless atomic. */
+ 		if (!cache->pfn || cache->gfn != gfn ||
+ 			cache->generation != gen) {
+ 			if (atomic)
+ 				return -EAGAIN;
+ 			kvm_cache_gfn_to_pfn(slot, gfn, cache, gen);
+ 		}
+ 		pfn = cache->pfn;
+ 	} else {
+ 		if (atomic)
+ 			return -EAGAIN;
+ 		pfn = gfn_to_pfn_memslot(slot, gfn);
+ 	}
+ 	if (is_error_noslot_pfn(pfn))
+ 		return -EINVAL;
+ 
+ 	if (pfn_valid(pfn)) {
+ 		/* Backed by a struct page: use kmap/kmap_atomic. */
+ 		page = pfn_to_page(pfn);
+ 		if (atomic)
+ 			hva = kmap_atomic(page);
+ 		else
+ 			hva = kmap(page);
+ #ifdef CONFIG_HAS_IOMEM
+ 	} else if (!atomic) {
+ 		/* No struct page: page stays KVM_UNMAPPED_PAGE, use memremap. */
+ 		hva = memremap(pfn_to_hpa(pfn), PAGE_SIZE, MEMREMAP_WB);
+ 	} else {
+ 		return -EINVAL;
+ #endif
+ 	}
+ 
+ 	if (!hva)
+ 		return -EFAULT;
+ 
+ 	map->page = page;
+ 	map->hva = hva;
+ 	map->pfn = pfn;
+ 	map->gfn = gfn;
+ 
+ 	return 0;
+ }
+ 
+ /*
+  * Map @gfn using the memslots of @vcpu's VM, optionally through @cache
+  * and optionally in atomic mode.  See __kvm_map_gfn() for return values.
+  */
+ int kvm_map_gfn(struct kvm_vcpu *vcpu, gfn_t gfn, struct kvm_host_map *map,
+ 		struct gfn_to_pfn_cache *cache, bool atomic)
+ {
+ 	struct kvm_memslots *slots = kvm_memslots(vcpu->kvm);
+ 
+ 	return __kvm_map_gfn(slots, gfn, map, cache, atomic);
+ }
+ EXPORT_SYMBOL_GPL(kvm_map_gfn);
+ 
+ /*
+  * Map @gfn using @vcpu's memslot view, without a pfn cache and in
+  * non-atomic mode.  See __kvm_map_gfn() for return values.
+  */
+ int kvm_vcpu_map(struct kvm_vcpu *vcpu, gfn_t gfn, struct kvm_host_map *map)
+ {
+ 	struct kvm_memslots *slots = kvm_vcpu_memslots(vcpu);
+ 
+ 	return __kvm_map_gfn(slots, gfn, map, NULL, false);
+ }
+ EXPORT_SYMBOL_GPL(kvm_vcpu_map);
+ 
+ /*
+  * Undo a mapping described by @map.
+  *
+  * Nothing happens for a NULL @map or one whose hva is NULL.  A mapping
+  * with a real struct page is released with kunmap/kunmap_atomic; one
+  * without is released with memunmap() when CONFIG_HAS_IOMEM is set
+  * (warning once if that is attempted in atomic mode).  When @dirty is
+  * set the gfn is marked dirty in @memslot.  With a @cache the dirty state
+  * is accumulated there and the pfn is kept; without one the pfn is
+  * released.  Finally @map's hva and page are cleared.
+  */
+ static void __kvm_unmap_gfn(struct kvm_memory_slot *memslot,
+ 			struct kvm_host_map *map,
+ 			struct gfn_to_pfn_cache *cache,
+ 			bool dirty, bool atomic)
+ {
+ 	if (!map)
+ 		return;
+ 
+ 	if (!map->hva)
+ 		return;
+ 
+ 	if (map->page != KVM_UNMAPPED_PAGE) {
+ 		if (atomic)
+ 			kunmap_atomic(map->hva);
+ 		else
+ 			kunmap(map->page);
+ 	}
+ #ifdef CONFIG_HAS_IOMEM
+ 	else if (!atomic)
+ 		memunmap(map->hva);
+ 	else
+ 		WARN_ONCE(1, "Unexpected unmapping in atomic context");
+ #endif
+ 
+ 	if (dirty)
+ 		mark_page_dirty_in_slot(memslot, map->gfn);
+ 
+ 	/* A cached pfn stays referenced; only remember that it got dirty. */
+ 	if (cache)
+ 		cache->dirty |= dirty;
+ 	else
+ 		kvm_release_pfn(map->pfn, dirty, NULL);
+ 
+ 	map->hva = NULL;
+ 	map->page = NULL;
+ }
+ 
+ /*
+  * Unmap @map using the memslots of @vcpu's VM, optionally recording the
+  * dirty state in @cache.  Always returns 0.
+  *
+  * __kvm_unmap_gfn() treats a NULL @map as a no-op, so the same is done
+  * here before map->gfn is read for the memslot lookup.
+  */
+ int kvm_unmap_gfn(struct kvm_vcpu *vcpu, struct kvm_host_map *map,
+ 		  struct gfn_to_pfn_cache *cache, bool dirty, bool atomic)
+ {
+ 	if (!map)
+ 		return 0;
+ 
+ 	__kvm_unmap_gfn(gfn_to_memslot(vcpu->kvm, map->gfn), map,
+ 			cache, dirty, atomic);
+ 	return 0;
+ }
+ EXPORT_SYMBOL_GPL(kvm_unmap_gfn);
+ 
+ /*
+  * Unmap @map using @vcpu's memslot view, without a pfn cache and in
+  * non-atomic mode.
+  *
+  * __kvm_unmap_gfn() treats a NULL @map as a no-op, so the same is done
+  * here before map->gfn is read for the memslot lookup.
+  */
+ void kvm_vcpu_unmap(struct kvm_vcpu *vcpu, struct kvm_host_map *map, bool dirty)
+ {
+ 	if (!map)
+ 		return;
+ 
+ 	__kvm_unmap_gfn(kvm_vcpu_gfn_to_memslot(vcpu, map->gfn), map, NULL,
+ 			dirty, false);
+ }
+ EXPORT_SYMBOL_GPL(kvm_vcpu_unmap);
+ 
+ /* Translate @gfn via @vcpu's memslot view and return its struct page. */
+ struct page *kvm_vcpu_gfn_to_page(struct kvm_vcpu *vcpu, gfn_t gfn)
+ {
+ 	return kvm_pfn_to_page(kvm_vcpu_gfn_to_pfn(vcpu, gfn));
+ }
+ EXPORT_SYMBOL_GPL(kvm_vcpu_gfn_to_page);
+ 
+ void kvm_release_page_clean(struct page *page)
+ {
+ 	WARN_ON(is_error_page(page));
+ 
+ 	kvm_release_pfn_clean(page_to_pfn(page));
+ }
+ EXPORT_SYMBOL_GPL(kvm_release_page_clean);
+ 
+ /*
+  * Drop a page reference for @pfn.  Error/no-slot pfns and reserved pfns
+  * are left alone.
+  */
+ void kvm_release_pfn_clean(kvm_pfn_t pfn)
+ {
+ 	if (is_error_noslot_pfn(pfn) || kvm_is_reserved_pfn(pfn))
+ 		return;
+ 
+ 	put_page(pfn_to_page(pfn));
+ }
+ EXPORT_SYMBOL_GPL(kvm_release_pfn_clean);
+ 
+ void kvm_release_page_dirty(struct page *page)
+ {
+ 	WARN_ON(is_error_page(page));
+ 
+ 	kvm_release_pfn_dirty(page_to_pfn(page));
+ }
+ EXPORT_SYMBOL_GPL(kvm_release_page_dirty);
+ 
+ /* Mark @pfn dirty, then release it exactly as kvm_release_pfn_clean() does. */
+ void kvm_release_pfn_dirty(kvm_pfn_t pfn)
+ {
+ 	kvm_set_pfn_dirty(pfn);
+ 	kvm_release_pfn_clean(pfn);
+ }
+ EXPORT_SYMBOL_GPL(kvm_release_pfn_dirty);
+ 
+ /*
+  * Set the dirty flag on the page backing @pfn.  Reserved and zone-device
+  * pfns are skipped.
+  */
+ void kvm_set_pfn_dirty(kvm_pfn_t pfn)
+ {
+ 	if (kvm_is_reserved_pfn(pfn) || kvm_is_zone_device_pfn(pfn))
+ 		return;
+ 
+ 	SetPageDirty(pfn_to_page(pfn));
+ }
+ EXPORT_SYMBOL_GPL(kvm_set_pfn_dirty);
+ 
+ /*
+  * Mark the page backing @pfn as accessed.  Reserved and zone-device pfns
+  * are skipped.
+  */
+ void kvm_set_pfn_accessed(kvm_pfn_t pfn)
+ {
+ 	if (kvm_is_reserved_pfn(pfn) || kvm_is_zone_device_pfn(pfn))
+ 		return;
+ 
+ 	mark_page_accessed(pfn_to_page(pfn));
+ }
+ EXPORT_SYMBOL_GPL(kvm_set_pfn_accessed);
+ 
+ /* Take a page reference for @pfn unless it is a reserved pfn. */
+ void kvm_get_pfn(kvm_pfn_t pfn)
+ {
+ 	if (kvm_is_reserved_pfn(pfn))
+ 		return;
+ 
+ 	get_page(pfn_to_page(pfn));
+ }
+ EXPORT_SYMBOL_GPL(kvm_get_pfn);
+ 
+ /*
+  * Size of the next chunk of a @len-byte access that starts @offset bytes
+  * into a page: the bytes left in that page, or @len if that is smaller.
+  */
+ static int next_segment(unsigned long len, int offset)
+ {
+ 	unsigned long room = PAGE_SIZE - offset;
+ 
+ 	return len > room ? room : len;
+ }
+ 
+ /*
+  * Copy @len bytes from guest frame @gfn of @slot, starting at @offset,
+  * into @data.  Returns 0 on success and -EFAULT if the gfn has no valid
+  * host address or the copy is incomplete.
+  */
+ static int __kvm_read_guest_page(struct kvm_memory_slot *slot, gfn_t gfn,
+ 				 void *data, int offset, int len)
+ {
+ 	unsigned long hva = gfn_to_hva_memslot_prot(slot, gfn, NULL);
+ 	int left;
+ 
+ 	if (kvm_is_error_hva(hva))
+ 		return -EFAULT;
+ 
+ 	left = __copy_from_user(data, (void __user *)hva + offset, len);
+ 
+ 	return left ? -EFAULT : 0;
+ }
+ 
+ /* Read from a single guest page, resolving @gfn via @kvm's memslots. */
+ int kvm_read_guest_page(struct kvm *kvm, gfn_t gfn, void *data, int offset,
+ 			int len)
+ {
+ 	return __kvm_read_guest_page(gfn_to_memslot(kvm, gfn), gfn, data,
+ 				     offset, len);
+ }
+ EXPORT_SYMBOL_GPL(kvm_read_guest_page);
+ 
+ /* Read from a single guest page, resolving @gfn via @vcpu's memslot view. */
+ int kvm_vcpu_read_guest_page(struct kvm_vcpu *vcpu, gfn_t gfn, void *data,
+ 			     int offset, int len)
+ {
+ 	return __kvm_read_guest_page(kvm_vcpu_gfn_to_memslot(vcpu, gfn), gfn,
+ 				     data, offset, len);
+ }
+ EXPORT_SYMBOL_GPL(kvm_vcpu_read_guest_page);
+ 
+ /*
+  * Read @len bytes of guest memory starting at @gpa into @data, one page
+  * sized piece at a time.  Returns 0 on success or the first negative
+  * value returned by kvm_read_guest_page().
+  */
+ int kvm_read_guest(struct kvm *kvm, gpa_t gpa, void *data, unsigned long len)
+ {
+ 	int offset = offset_in_page(gpa);
+ 	gfn_t gfn = gpa >> PAGE_SHIFT;
+ 
+ 	for (;;) {
+ 		int chunk = next_segment(len, offset);
+ 		int ret;
+ 
+ 		if (!chunk)
+ 			break;
+ 
+ 		ret = kvm_read_guest_page(kvm, gfn, data, offset, chunk);
+ 		if (ret < 0)
+ 			return ret;
+ 
+ 		/* Only the first page can start at a non-zero offset. */
+ 		offset = 0;
+ 		len -= chunk;
+ 		data += chunk;
+ 		++gfn;
+ 	}
+ 	return 0;
+ }
+ EXPORT_SYMBOL_GPL(kvm_read_guest);
+ 
+ /*
+  * Read @len bytes of guest memory starting at @gpa into @data through
+  * @vcpu's memslot view, one page sized piece at a time.  Returns 0 on
+  * success or the first negative value from kvm_vcpu_read_guest_page().
+  */
+ int kvm_vcpu_read_guest(struct kvm_vcpu *vcpu, gpa_t gpa, void *data, unsigned long len)
+ {
+ 	int offset = offset_in_page(gpa);
+ 	gfn_t gfn = gpa >> PAGE_SHIFT;
+ 
+ 	for (;;) {
+ 		int chunk = next_segment(len, offset);
+ 		int ret;
+ 
+ 		if (!chunk)
+ 			break;
+ 
+ 		ret = kvm_vcpu_read_guest_page(vcpu, gfn, data, offset, chunk);
+ 		if (ret < 0)
+ 			return ret;
+ 
+ 		/* Only the first page can start at a non-zero offset. */
+ 		offset = 0;
+ 		len -= chunk;
+ 		data += chunk;
+ 		++gfn;
+ 	}
+ 	return 0;
+ }
+ EXPORT_SYMBOL_GPL(kvm_vcpu_read_guest);
+ 
+ /*
+  * Copy @len bytes from guest frame @gfn of @slot, starting at @offset,
+  * into @data with page faults disabled for the duration of the copy.
+  * Returns 0 on success and -EFAULT if the gfn has no valid host address
+  * or the copy is incomplete.
+  */
+ static int __kvm_read_guest_atomic(struct kvm_memory_slot *slot, gfn_t gfn,
+ 			           void *data, int offset, unsigned long len)
+ {
+ 	unsigned long hva = gfn_to_hva_memslot_prot(slot, gfn, NULL);
+ 	int left;
+ 
+ 	if (kvm_is_error_hva(hva))
+ 		return -EFAULT;
+ 
+ 	pagefault_disable();
+ 	left = __copy_from_user_inatomic(data, (void __user *)hva + offset, len);
+ 	pagefault_enable();
+ 
+ 	return left ? -EFAULT : 0;
+ }
+ 
+ /*
+  * Atomic read of @len bytes at @gpa; the memslot is resolved through
+  * @kvm.  See __kvm_read_guest_atomic() for return values.
+  */
+ int kvm_read_guest_atomic(struct kvm *kvm, gpa_t gpa, void *data,
+ 			  unsigned long len)
+ {
+ 	int offset = offset_in_page(gpa);
+ 	gfn_t gfn = gpa >> PAGE_SHIFT;
+ 
+ 	return __kvm_read_guest_atomic(gfn_to_memslot(kvm, gfn), gfn, data,
+ 				       offset, len);
+ }
+ EXPORT_SYMBOL_GPL(kvm_read_guest_atomic);
+ 
+ /*
+  * Atomic read of @len bytes at @gpa; the memslot is resolved through
+  * @vcpu's memslot view.  See __kvm_read_guest_atomic() for return values.
+  */
+ int kvm_vcpu_read_guest_atomic(struct kvm_vcpu *vcpu, gpa_t gpa,
+ 			       void *data, unsigned long len)
+ {
+ 	int offset = offset_in_page(gpa);
+ 	gfn_t gfn = gpa >> PAGE_SHIFT;
+ 
+ 	return __kvm_read_guest_atomic(kvm_vcpu_gfn_to_memslot(vcpu, gfn), gfn,
+ 				       data, offset, len);
+ }
+ EXPORT_SYMBOL_GPL(kvm_vcpu_read_guest_atomic);
+ 
+ /*
+  * Copy @len bytes from @data into guest frame @gfn of @memslot, starting
+  * at @offset, and mark the page dirty in that slot afterwards.  Returns 0
+  * on success and -EFAULT if the gfn has no valid host address or the copy
+  * is incomplete (in which case the page is not marked dirty).
+  */
+ static int __kvm_write_guest_page(struct kvm_memory_slot *memslot, gfn_t gfn,
+ 			          const void *data, int offset, int len)
+ {
+ 	unsigned long hva = gfn_to_hva_memslot(memslot, gfn);
+ 	int left;
+ 
+ 	if (kvm_is_error_hva(hva))
+ 		return -EFAULT;
+ 
+ 	left = __copy_to_user((void __user *)hva + offset, data, len);
+ 	if (left)
+ 		return -EFAULT;
+ 
+ 	mark_page_dirty_in_slot(memslot, gfn);
+ 	return 0;
+ }
+ 
+ /* Write to a single guest page, resolving @gfn via @kvm's memslots. */
+ int kvm_write_guest_page(struct kvm *kvm, gfn_t gfn,
+ 			 const void *data, int offset, int len)
+ {
+ 	return __kvm_write_guest_page(gfn_to_memslot(kvm, gfn), gfn, data,
+ 				      offset, len);
+ }
+ EXPORT_SYMBOL_GPL(kvm_write_guest_page);
+ 
+ /* Write to a single guest page, resolving @gfn via @vcpu's memslot view. */
+ int kvm_vcpu_write_guest_page(struct kvm_vcpu *vcpu, gfn_t gfn,
+ 			      const void *data, int offset, int len)
+ {
+ 	return __kvm_write_guest_page(kvm_vcpu_gfn_to_memslot(vcpu, gfn), gfn,
+ 				      data, offset, len);
+ }
+ EXPORT_SYMBOL_GPL(kvm_vcpu_write_guest_page);
+ 
+ /*
+  * Write @len bytes from @data to guest memory starting at @gpa, one page
+  * sized piece at a time.  Returns 0 on success or the first negative
+  * value returned by kvm_write_guest_page().
+  */
+ int kvm_write_guest(struct kvm *kvm, gpa_t gpa, const void *data,
+ 		    unsigned long len)
+ {
+ 	int offset = offset_in_page(gpa);
+ 	gfn_t gfn = gpa >> PAGE_SHIFT;
+ 
+ 	for (;;) {
+ 		int chunk = next_segment(len, offset);
+ 		int ret;
+ 
+ 		if (!chunk)
+ 			break;
+ 
+ 		ret = kvm_write_guest_page(kvm, gfn, data, offset, chunk);
+ 		if (ret < 0)
+ 			return ret;
+ 
+ 		/* Only the first page can start at a non-zero offset. */
+ 		offset = 0;
+ 		len -= chunk;
+ 		data += chunk;
+ 		++gfn;
+ 	}
+ 	return 0;
+ }
+ EXPORT_SYMBOL_GPL(kvm_write_guest);
+ 
+ /*
+  * Write @len bytes from @data to guest memory starting at @gpa through
+  * @vcpu's memslot view, one page sized piece at a time.  Returns 0 on
+  * success or the first negative value from kvm_vcpu_write_guest_page().
+  */
+ int kvm_vcpu_write_guest(struct kvm_vcpu *vcpu, gpa_t gpa, const void *data,
+ 		         unsigned long len)
+ {
+ 	int offset = offset_in_page(gpa);
+ 	gfn_t gfn = gpa >> PAGE_SHIFT;
+ 
+ 	for (;;) {
+ 		int chunk = next_segment(len, offset);
+ 		int ret;
+ 
+ 		if (!chunk)
+ 			break;
+ 
+ 		ret = kvm_vcpu_write_guest_page(vcpu, gfn, data, offset, chunk);
+ 		if (ret < 0)
+ 			return ret;
+ 
+ 		/* Only the first page can start at a non-zero offset. */
+ 		offset = 0;
+ 		len -= chunk;
+ 		data += chunk;
+ 		++gfn;
+ 	}
+ 	return 0;
+ }
+ EXPORT_SYMBOL_GPL(kvm_vcpu_write_guest);
+ 
+ /*
+  * (Re)initialise @ghc so that it describes the @len bytes of guest memory
+  * at @gpa under the memslot generation of @slots.
+  *
+  * Every page of the range is checked for a valid host address.  On
+  * success, a range contained in a single page gets ghc->hva pointing at
+  * the first byte and ghc->memslot set; a range spanning several pages gets
+  * ghc->memslot == NULL so that callers take their slow path.
+  *
+  * Returns 0 on success, -EINVAL if the range's last gfn is below its
+  * first (e.g. a wrapping or empty range), and -EFAULT if some page has no
+  * valid host address; on any failure ghc->memslot is NULL.
+  */
+ static int __kvm_gfn_to_hva_cache_init(struct kvm_memslots *slots,
+ 				       struct gfn_to_hva_cache *ghc,
+ 				       gpa_t gpa, unsigned long len)
+ {
+ 	int offset = offset_in_page(gpa);
+ 	gfn_t start_gfn = gpa >> PAGE_SHIFT;
+ 	gfn_t end_gfn = (gpa + len - 1) >> PAGE_SHIFT;
+ 	gfn_t nr_pages_needed = end_gfn - start_gfn + 1;
+ 	gfn_t nr_pages_avail;
+ 	int r = start_gfn <= end_gfn ? 0 : -EINVAL;
+ 
+ 	ghc->gpa = gpa;
+ 	ghc->generation = slots->generation;
+ 	ghc->len = len;
+ 	ghc->hva = KVM_HVA_ERR_BAD;
+ 
+ 	/*
+ 	 * If the requested region crosses two memslots, we still
+ 	 * verify that the entire region is valid here.
+ 	 */
+ 	while (!r && start_gfn <= end_gfn) {
+ 		ghc->memslot = __gfn_to_memslot(slots, start_gfn);
+ 		ghc->hva = gfn_to_hva_many(ghc->memslot, start_gfn,
+ 					   &nr_pages_avail);
+ 		if (kvm_is_error_hva(ghc->hva)) {
+ 			/*
+ 			 * Stop right away: on a failed lookup nr_pages_avail
+ 			 * may not have been written, so it must not be read.
+ 			 */
+ 			r = -EFAULT;
+ 			break;
+ 		}
+ 		start_gfn += nr_pages_avail;
+ 	}
+ 
+ 	/* Use the slow path for cross page reads and writes. */
+ 	if (!r && nr_pages_needed == 1)
+ 		ghc->hva += offset;
+ 	else
+ 		ghc->memslot = NULL;
+ 
+ 	return r;
+ }
+ 
+ /*
+  * Initialise @ghc for the @len bytes at @gpa using @kvm's current
+  * memslots.  See __kvm_gfn_to_hva_cache_init() for return values.
+  */
+ int kvm_gfn_to_hva_cache_init(struct kvm *kvm, struct gfn_to_hva_cache *ghc,
+ 			      gpa_t gpa, unsigned long len)
+ {
+ 	return __kvm_gfn_to_hva_cache_init(kvm_memslots(kvm), ghc, gpa, len);
+ }
+ EXPORT_SYMBOL_GPL(kvm_gfn_to_hva_cache_init);
+ 
+ /*
+  * Write @len bytes from @data to the guest range described by @ghc,
+  * starting @offset bytes into it.
+  *
+  * The cache is re-initialised first when the memslot generation changed.
+  * A cache with an error hva yields -EFAULT; a cache without a memslot
+  * falls back to kvm_write_guest().  Otherwise the data is copied through
+  * the cached hva and the page is marked dirty.  Returns 0 on success and
+  * -EFAULT on an incomplete copy.  It is a BUG for @offset + @len to exceed
+  * the cached length.
+  */
+ int kvm_write_guest_offset_cached(struct kvm *kvm, struct gfn_to_hva_cache *ghc,
+ 				  void *data, unsigned int offset,
+ 				  unsigned long len)
+ {
+ 	struct kvm_memslots *slots = kvm_memslots(kvm);
+ 	gpa_t gpa = ghc->gpa + offset;
+ 	int left;
+ 
+ 	BUG_ON(len + offset > ghc->len);
+ 
+ 	/* Memslots changed since the cache was filled: refresh it. */
+ 	if (ghc->generation != slots->generation)
+ 		__kvm_gfn_to_hva_cache_init(slots, ghc, ghc->gpa, ghc->len);
+ 
+ 	if (kvm_is_error_hva(ghc->hva))
+ 		return -EFAULT;
+ 
+ 	if (unlikely(!ghc->memslot))
+ 		return kvm_write_guest(kvm, gpa, data, len);
+ 
+ 	left = __copy_to_user((void __user *)ghc->hva + offset, data, len);
+ 	if (left)
+ 		return -EFAULT;
+ 
+ 	mark_page_dirty_in_slot(ghc->memslot, gpa >> PAGE_SHIFT);
+ 	return 0;
+ }
+ EXPORT_SYMBOL_GPL(kvm_write_guest_offset_cached);
+ 
+ /* kvm_write_guest_offset_cached() with an offset of 0. */
+ int kvm_write_guest_cached(struct kvm *kvm, struct gfn_to_hva_cache *ghc,
+ 			   void *data, unsigned long len)
+ {
+ 	return kvm_write_guest_offset_cached(kvm, ghc, data, 0, len);
+ }
+ EXPORT_SYMBOL_GPL(kvm_write_guest_cached);
+ 
+ /*
+  * Read @len bytes from the start of the guest range described by @ghc
+  * into @data.
+  *
+  * The cache is re-initialised first when the memslot generation changed.
+  * A cache with an error hva yields -EFAULT; a cache without a memslot
+  * falls back to kvm_read_guest().  Otherwise the data is copied through
+  * the cached hva.  Returns 0 on success and -EFAULT on an incomplete
+  * copy.  It is a BUG for @len to exceed the cached length.
+  */
+ int kvm_read_guest_cached(struct kvm *kvm, struct gfn_to_hva_cache *ghc,
+ 			   void *data, unsigned long len)
+ {
+ 	struct kvm_memslots *slots = kvm_memslots(kvm);
+ 	int left;
+ 
+ 	BUG_ON(len > ghc->len);
+ 
+ 	/* Memslots changed since the cache was filled: refresh it. */
+ 	if (ghc->generation != slots->generation)
+ 		__kvm_gfn_to_hva_cache_init(slots, ghc, ghc->gpa, ghc->len);
+ 
+ 	if (kvm_is_error_hva(ghc->hva))
+ 		return -EFAULT;
+ 
+ 	if (unlikely(!ghc->memslot))
+ 		return kvm_read_guest(kvm, ghc->gpa, data, len);
+ 
+ 	left = __copy_from_user(data, (void __user *)ghc->hva, len);
+ 
+ 	return left ? -EFAULT : 0;
+ }
+ EXPORT_SYMBOL_GPL(kvm_read_guest_cached);
+ 
+ /*
+  * Zero @len bytes at @offset within guest frame @gfn by writing from the
+  * kernel mapping of the zero page.
+  */
+ int kvm_clear_guest_page(struct kvm *kvm, gfn_t gfn, int offset, int len)
+ {
+ 	const void *zeroes;
+ 
+ 	zeroes = (const void *) __va(page_to_phys(ZERO_PAGE(0)));
+ 
+ 	return kvm_write_guest_page(kvm, gfn, zeroes, offset, len);
+ }
+ EXPORT_SYMBOL_GPL(kvm_clear_guest_page);
+ 
+ /*
+  * Zero @len bytes of guest memory starting at @gpa, one page sized piece
+  * at a time.  Returns 0 on success or the first negative value returned
+  * by kvm_clear_guest_page().
+  */
+ int kvm_clear_guest(struct kvm *kvm, gpa_t gpa, unsigned long len)
+ {
+ 	int offset = offset_in_page(gpa);
+ 	gfn_t gfn = gpa >> PAGE_SHIFT;
+ 
+ 	for (;;) {
+ 		int chunk = next_segment(len, offset);
+ 		int ret;
+ 
+ 		if (!chunk)
+ 			break;
+ 
+ 		ret = kvm_clear_guest_page(kvm, gfn, offset, chunk);
+ 		if (ret < 0)
+ 			return ret;
+ 
+ 		/* Only the first page can start at a non-zero offset. */
+ 		offset = 0;
+ 		len -= chunk;
+ 		++gfn;
+ 	}
+ 	return 0;
+ }
+ EXPORT_SYMBOL_GPL(kvm_clear_guest);
+ 
+ /*
+  * Set the dirty bit for @gfn in @memslot's dirty bitmap.  Does nothing
+  * when @memslot is NULL or has no dirty bitmap.
+  */
+ static void mark_page_dirty_in_slot(struct kvm_memory_slot *memslot,
+ 				    gfn_t gfn)
+ {
+ 	unsigned long rel_gfn;
+ 
+ 	if (!memslot || !memslot->dirty_bitmap)
+ 		return;
+ 
+ 	/* The bitmap is indexed relative to the slot's first gfn. */
+ 	rel_gfn = gfn - memslot->base_gfn;
+ 	set_bit_le(rel_gfn, memslot->dirty_bitmap);
+ }
+ 
+ /* Mark @gfn dirty in whichever of @kvm's memslots contains it. */
+ void mark_page_dirty(struct kvm *kvm, gfn_t gfn)
+ {
+ 	mark_page_dirty_in_slot(gfn_to_memslot(kvm, gfn), gfn);
+ }
+ EXPORT_SYMBOL_GPL(mark_page_dirty);
+ 
+ /* Mark @gfn dirty in the memslot found through @vcpu's memslot view. */
+ void kvm_vcpu_mark_page_dirty(struct kvm_vcpu *vcpu, gfn_t gfn)
+ {
+ 	mark_page_dirty_in_slot(kvm_vcpu_gfn_to_memslot(vcpu, gfn), gfn);
+ }
+ EXPORT_SYMBOL_GPL(kvm_vcpu_mark_page_dirty);
+ 
+ /*
+  * Install the vCPU's signal mask on the current task, if one is active,
+  * saving the previous mask in current->real_blocked.
+  */
+ void kvm_sigset_activate(struct kvm_vcpu *vcpu)
+ {
+ 	/*
+ 	 * ->real_blocked is modified here without a lock.  That is fine:
+ 	 * only current can change ->real_blocked, and its readers do not
+ 	 * care as long as ->real_blocked is always a subset of ->blocked.
+ 	 */
+ 	if (vcpu->sigset_active)
+ 		sigprocmask(SIG_SETMASK, &vcpu->sigset, &current->real_blocked);
+ }
+ 
+ /*
+  * Undo kvm_sigset_activate(): restore the mask saved in
+  * current->real_blocked and then empty that saved set.  No-op when the
+  * vCPU has no active signal mask.
+  */
+ void kvm_sigset_deactivate(struct kvm_vcpu *vcpu)
+ {
+ 	if (vcpu->sigset_active) {
+ 		sigprocmask(SIG_SETMASK, &current->real_blocked, NULL);
+ 		sigemptyset(&current->real_blocked);
+ 	}
+ }
+ 
+ /*
+  * Enlarge @vcpu's halt-polling window: multiply it by halt_poll_ns_grow,
+  * raise it to at least halt_poll_ns_grow_start and cap it at
+  * halt_poll_ns.  A grow factor of 0 leaves the window unchanged.  The old
+  * and new values are traced in either case.
+  */
+ static void grow_halt_poll_ns(struct kvm_vcpu *vcpu)
+ {
+ 	unsigned int prev, next, factor, floor;
+ 
+ 	prev = next = vcpu->halt_poll_ns;
+ 	floor = READ_ONCE(halt_poll_ns_grow_start);
+ 	factor = READ_ONCE(halt_poll_ns_grow);
+ 
+ 	if (factor) {
+ 		next *= factor;
+ 		if (next < floor)
+ 			next = floor;
+ 
+ 		if (next > halt_poll_ns)
+ 			next = halt_poll_ns;
+ 
+ 		vcpu->halt_poll_ns = next;
+ 	}
+ 
+ 	trace_kvm_halt_poll_ns_grow(vcpu->vcpu_id, next, prev);
+ }
+ 
+ /*
+  * Reduce @vcpu's halt-polling window: divide it by halt_poll_ns_shrink,
+  * or reset it to 0 when that divisor is 0.  The old and new values are
+  * traced.
+  */
+ static void shrink_halt_poll_ns(struct kvm_vcpu *vcpu)
+ {
+ 	unsigned int prev = vcpu->halt_poll_ns;
+ 	unsigned int divisor = READ_ONCE(halt_poll_ns_shrink);
+ 	unsigned int next = divisor ? prev / divisor : 0;
+ 
+ 	vcpu->halt_poll_ns = next;
+ 	trace_kvm_halt_poll_ns_shrink(vcpu->vcpu_id, next, prev);
+ }
+ 
+ /*
+  * Decide whether @vcpu should stop blocking.
+  *
+  * Returns -EINTR when the vCPU is runnable (KVM_REQ_UNHALT is requested
+  * in that case), has a pending timer, or the current task has a signal
+  * pending; returns 0 otherwise.  The checks run under the VM's SRCU read
+  * lock.
+  */
+ static int kvm_vcpu_check_block(struct kvm_vcpu *vcpu)
+ {
+ 	int ret = -EINTR;
+ 	int idx = srcu_read_lock(&vcpu->kvm->srcu);
+ 
+ 	if (kvm_arch_vcpu_runnable(vcpu))
+ 		kvm_make_request(KVM_REQ_UNHALT, vcpu);
+ 	else if (!kvm_cpu_has_pending_timer(vcpu) && !signal_pending(current))
+ 		ret = 0;
+ 
+ 	srcu_read_unlock(&vcpu->kvm->srcu, idx);
+ 	return ret;
+ }
+ 
+ /*
+  * The vCPU has executed a HLT instruction with in-kernel mode enabled.
+  *
+  * Blocks @vcpu until kvm_vcpu_check_block() reports a reason to wake up.
+  * If the vCPU has a non-zero polling window and the architecture allows
+  * polling, the wake-up condition is first busy-polled for up to
+  * vcpu->halt_poll_ns; only if that fails does the task sleep on vcpu->wq.
+  * Afterwards the polling window is shrunk, grown or left alone depending
+  * on how long the block lasted and whether the wake-up was valid.
+  */
+ void kvm_vcpu_block(struct kvm_vcpu *vcpu)
+ {
+ 	ktime_t start, cur;
+ 	DECLARE_SWAITQUEUE(wait);
+ 	bool waited = false;
+ 	u64 block_ns;
+ 
+ 	kvm_arch_vcpu_blocking(vcpu);
+ 
+ 	start = cur = ktime_get();
+ 	if (vcpu->halt_poll_ns && !kvm_arch_no_poll(vcpu)) {
+ 		ktime_t stop = ktime_add_ns(ktime_get(), vcpu->halt_poll_ns);
+ 
+ 		++vcpu->stat.halt_attempted_poll;
+ 		do {
+ 			/*
+ 			 * This sets KVM_REQ_UNHALT if an interrupt
+ 			 * arrives.
+ 			 */
+ 			if (kvm_vcpu_check_block(vcpu) < 0) {
+ 				++vcpu->stat.halt_successful_poll;
+ 				if (!vcpu_valid_wakeup(vcpu))
+ 					++vcpu->stat.halt_poll_invalid;
+ 				goto out;
+ 			}
+ 			cur = ktime_get();
+ 		} while (single_task_running() && ktime_before(cur, stop));
+ 	}
+ 
+ 	/* Polling did not help (or was skipped): sleep until woken. */
+ 	for (;;) {
+ 		prepare_to_swait_exclusive(&vcpu->wq, &wait, TASK_INTERRUPTIBLE);
+ 
+ 		if (kvm_vcpu_check_block(vcpu) < 0)
+ 			break;
+ 
+ 		waited = true;
+ 		schedule();
+ 	}
+ 
+ 	finish_swait(&vcpu->wq, &wait);
+ 	cur = ktime_get();
+ out:
+ 	kvm_arch_vcpu_unblocking(vcpu);
+ 	/* Total time spent blocked, polling included. */
+ 	block_ns = ktime_to_ns(cur) - ktime_to_ns(start);
+ 
+ 	if (!kvm_arch_no_poll(vcpu)) {
+ 		if (!vcpu_valid_wakeup(vcpu)) {
+ 			shrink_halt_poll_ns(vcpu);
+ 		} else if (halt_poll_ns) {
+ 			/* the block fit inside the polling window: keep it */
+ 			if (block_ns <= vcpu->halt_poll_ns)
+ 				;
+ 			/* we had a long block, shrink polling */
+ 			else if (vcpu->halt_poll_ns && block_ns > halt_poll_ns)
+ 				shrink_halt_poll_ns(vcpu);
+ 			/* we had a short halt and our poll time is too small */
+ 			else if (vcpu->halt_poll_ns < halt_poll_ns &&
+ 				block_ns < halt_poll_ns)
+ 				grow_halt_poll_ns(vcpu);
+ 		} else {
+ 			/* halt_poll_ns is 0: polling is switched off */
+ 			vcpu->halt_poll_ns = 0;
+ 		}
+ 	}
+ 
+ 	trace_kvm_vcpu_wakeup(block_ns, waited, vcpu_valid_wakeup(vcpu));
+ 	kvm_arch_vcpu_block_finish(vcpu);
+ }
+ EXPORT_SYMBOL_GPL(kvm_vcpu_block);
+ 
+ /*
+  * Wake @vcpu if it is sleeping on its wait queue.  On a wake-up the vCPU
+  * is flagged ready and its halt_wakeup statistic is bumped.  Returns true
+  * if a sleeper was woken, false otherwise.
+  */
+ bool kvm_vcpu_wake_up(struct kvm_vcpu *vcpu)
+ {
+ 	struct swait_queue_head *wqp = kvm_arch_vcpu_wq(vcpu);
+ 
+ 	if (!swq_has_sleeper(wqp))
+ 		return false;
+ 
+ 	swake_up_one(wqp);
+ 	WRITE_ONCE(vcpu->ready, true);
+ 	++vcpu->stat.halt_wakeup;
+ 	return true;
+ }
+ EXPORT_SYMBOL_GPL(kvm_vcpu_wake_up);
+ 
+ #ifndef CONFIG_S390
+ /*
+  * Kick a sleeping VCPU, or a guest VCPU in guest mode, into host kernel mode.
+  *
+  * A sleeping vCPU is simply woken.  Otherwise a reschedule IPI is sent to
+  * the CPU recorded in vcpu->cpu, provided that CPU is not the current one,
+  * is a valid online CPU, and the architecture agrees a kick is needed.
+  */
+ void kvm_vcpu_kick(struct kvm_vcpu *vcpu)
+ {
+ 	int cpu = vcpu->cpu;
+ 	int me;
+ 
+ 	if (kvm_vcpu_wake_up(vcpu))
+ 		return;
+ 
+ 	me = get_cpu();
+ 	if (cpu != me && (unsigned)cpu < nr_cpu_ids && cpu_online(cpu) &&
+ 	    kvm_arch_vcpu_should_kick(vcpu))
+ 		smp_send_reschedule(cpu);
+ 	put_cpu();
+ }
+ EXPORT_SYMBOL_GPL(kvm_vcpu_kick);
+ #endif /* !CONFIG_S390 */
+ 
+ /*
+  * Yield the current task's CPU time to the task running @target.
+  *
+  * The task is looked up from target->pid under RCU and a reference is
+  * held across the yield.  Returns 0 when no such task exists, otherwise
+  * the result of yield_to().
+  */
+ int kvm_vcpu_yield_to(struct kvm_vcpu *target)
+ {
+ 	struct task_struct *task = NULL;
+ 	struct pid *pid;
+ 	int ret;
+ 
+ 	rcu_read_lock();
+ 	pid = rcu_dereference(target->pid);
+ 	if (pid)
+ 		task = get_pid_task(pid, PIDTYPE_PID);
+ 	rcu_read_unlock();
+ 
+ 	if (!task)
+ 		return 0;
+ 
+ 	ret = yield_to(task, 1);
+ 	put_task_struct(task);
+ 
+ 	return ret;
+ }
+ EXPORT_SYMBOL_GPL(kvm_vcpu_yield_to);
+ 
+ /*
+  * Helper that checks whether a VCPU is eligible for directed yield.
+  * The most eligible candidate to yield to is decided by the following
+  * heuristics:
+  *
+  *  (a) VCPU which has not done pl-exit or cpu relax intercepted recently
+  *  (preempted lock holder), indicated by @in_spin_loop.
+  *  Set at the beginning and cleared at the end of interception/PLE handler.
+  *
+  *  (b) VCPU which has done pl-exit/ cpu relax intercepted but did not get
+  *  chance last time (mostly it has become eligible now since we have probably
+  *  yielded to lockholder in last iteration. This is done by toggling
+  *  @dy_eligible each time a VCPU checked for eligibility.)
+  *
+  *  Yielding to a recently pl-exited/cpu relax intercepted VCPU before yielding
+  *  to preempted lock-holder could result in wrong VCPU selection and CPU
+  *  burning. Giving priority for a potential lock-holder increases lock
+  *  progress.
+  *
+  *  Since the algorithm is based on heuristics, accessing another VCPU's data
+  *  without locking does no harm. It may result in trying to yield to the same
+  *  VCPU, failing, and continuing with the next VCPU and so on.
+  *
+  *  Without CONFIG_HAVE_KVM_CPU_RELAX_INTERCEPT every VCPU is eligible.
+  */
+ static bool kvm_vcpu_eligible_for_directed_yield(struct kvm_vcpu *vcpu)
+ {
+ #ifdef CONFIG_HAVE_KVM_CPU_RELAX_INTERCEPT
+ 	bool eligible;
+ 
+ 	eligible = !vcpu->spin_loop.in_spin_loop ||
+ 		    vcpu->spin_loop.dy_eligible;
+ 
+ 	/* Toggle so a spinning VCPU is eligible on every other check. */
+ 	if (vcpu->spin_loop.in_spin_loop)
+ 		kvm_vcpu_set_dy_eligible(vcpu, !vcpu->spin_loop.dy_eligible);
+ 
+ 	return eligible;
+ #else
+ 	return true;
+ #endif
+ }
+ 
+ /*
+  * Unlike kvm_arch_vcpu_runnable, this function is called outside
+  * a vcpu_load/vcpu_put pair.  However, for most architectures
+  * kvm_arch_vcpu_runnable does not require vcpu_load.
+  *
+  * This is the weak default, which simply defers to
+  * kvm_arch_vcpu_runnable(); being __weak, it can be replaced by an
+  * architecture-specific definition.
+  */
+ bool __weak kvm_arch_dy_runnable(struct kvm_vcpu *vcpu)
+ {
+ 	return kvm_arch_vcpu_runnable(vcpu);
+ }
+ 
+ /*
+  * Runnable test used when picking a directed-yield target: true if the
+  * architecture considers @vcpu runnable or, with CONFIG_KVM_ASYNC_PF, if
+  * its list of completed async page faults is not empty.
+  */
+ static bool vcpu_dy_runnable(struct kvm_vcpu *vcpu)
+ {
+ 	if (kvm_arch_dy_runnable(vcpu))
+ 		return true;
+ 
+ #ifdef CONFIG_KVM_ASYNC_PF
+ 	return !list_empty_careful(&vcpu->async_pf.done);
+ #else
+ 	return false;
+ #endif
+ }
+ 
+ /*
+  * Called for a vCPU @me that is spinning: try to yield to another vCPU of
+  * the same VM that is likely to make progress.
+  *
+  * Candidates are scanned in two passes, first those after the last
+  * boosted vCPU and then those up to and including it.  The scan stops at
+  * the first successful yield or after three failed yield attempts.  With
+  * @yield_to_kernel_mode set, preempted candidates that are not in kernel
+  * mode are skipped.
+  */
+ void kvm_vcpu_on_spin(struct kvm_vcpu *me, bool yield_to_kernel_mode)
+ {
+ 	struct kvm *kvm = me->kvm;
+ 	struct kvm_vcpu *vcpu;
+ 	int last_boosted_vcpu = me->kvm->last_boosted_vcpu;
+ 	int yielded = 0;
+ 	int try = 3;
+ 	int pass;
+ 	int i;
+ 
+ 	kvm_vcpu_set_in_spin_loop(me, true);
+ 	/*
+ 	 * We boost the priority of a VCPU that is runnable but not
+ 	 * currently running, because it got preempted by something
+ 	 * else and called schedule in __vcpu_run.  Hopefully that
+ 	 * VCPU is holding the lock that we need and will release it.
+ 	 * We approximate round-robin by starting at the last boosted VCPU.
+ 	 */
+ 	for (pass = 0; pass < 2 && !yielded && try; pass++) {
+ 		kvm_for_each_vcpu(i, vcpu, kvm) {
+ 			if (!pass && i <= last_boosted_vcpu) {
+ 				/* First pass: jump past the last boosted VCPU. */
+ 				i = last_boosted_vcpu;
+ 				continue;
+ 			} else if (pass && i > last_boosted_vcpu)
+ 				break;
+ 			if (!READ_ONCE(vcpu->ready))
+ 				continue;
+ 			if (vcpu == me)
+ 				continue;
+ 			if (swait_active(&vcpu->wq) && !vcpu_dy_runnable(vcpu))
+ 				continue;
+ 			if (READ_ONCE(vcpu->preempted) && yield_to_kernel_mode &&
+ 				!kvm_arch_vcpu_in_kernel(vcpu))
+ 				continue;
+ 			if (!kvm_vcpu_eligible_for_directed_yield(vcpu))
+ 				continue;
+ 
+ 			yielded = kvm_vcpu_yield_to(vcpu);
+ 			if (yielded > 0) {
+ 				kvm->last_boosted_vcpu = i;
+ 				break;
+ 			} else if (yielded < 0) {
+ 				/* Failed attempt: give up after three of them. */
+ 				try--;
+ 				if (!try)
+ 					break;
+ 			}
+ 		}
+ 	}
+ 	kvm_vcpu_set_in_spin_loop(me, false);
+ 
+ 	/* Ensure vcpu is not eligible during next spinloop */
+ 	kvm_vcpu_set_dy_eligible(me, false);
+ }
+ EXPORT_SYMBOL_GPL(kvm_vcpu_on_spin);
+ 
+ /*
+  * Page-fault handler for mappings of a vCPU file.
+  *
+  * Page offset 0 maps vcpu->run; on x86 KVM_PIO_PAGE_OFFSET maps the PIO
+  * data page; with CONFIG_KVM_MMIO KVM_COALESCED_MMIO_PAGE_OFFSET maps the
+  * VM's coalesced MMIO ring.  Any other offset is passed on to
+  * kvm_arch_vcpu_fault().  For the pages handled here a reference is taken
+  * and the page is returned through vmf->page.
+  */
+ static vm_fault_t kvm_vcpu_fault(struct vm_fault *vmf)
+ {
+ 	struct kvm_vcpu *vcpu = vmf->vma->vm_file->private_data;
+ 	struct page *page;
+ 
+ 	if (vmf->pgoff == 0)
+ 		page = virt_to_page(vcpu->run);
+ #ifdef CONFIG_X86
+ 	else if (vmf->pgoff == KVM_PIO_PAGE_OFFSET)
+ 		page = virt_to_page(vcpu->arch.pio_data);
+ #endif
+ #ifdef CONFIG_KVM_MMIO
+ 	else if (vmf->pgoff == KVM_COALESCED_MMIO_PAGE_OFFSET)
+ 		page = virt_to_page(vcpu->kvm->coalesced_mmio_ring);
+ #endif
+ 	else
+ 		return kvm_arch_vcpu_fault(vcpu, vmf);
+ 	get_page(page);
+ 	vmf->page = page;
+ 	return 0;
+ }
+ 
+ /* VMA operations installed by kvm_vcpu_mmap(); only faults are handled. */
+ static const struct vm_operations_struct kvm_vcpu_vm_ops = {
+ 	.fault = kvm_vcpu_fault,
+ };
+ 
+ /*
+  * mmap handler for vCPU files: installs kvm_vcpu_vm_ops on @vma so that
+  * pages are provided by kvm_vcpu_fault().  Always returns 0.
+  */
+ static int kvm_vcpu_mmap(struct file *file, struct vm_area_struct *vma)
+ {
+ 	vma->vm_ops = &kvm_vcpu_vm_ops;
+ 	return 0;
+ }
+ 
+ /*
+  * Release handler for vCPU files: removes the vCPU's debugfs directory
+  * and drops the VM reference taken when the file was created.  Always
+  * returns 0.
+  */
+ static int kvm_vcpu_release(struct inode *inode, struct file *filp)
+ {
+ 	struct kvm_vcpu *vcpu = filp->private_data;
+ 	struct kvm *kvm;
+ 
+ 	debugfs_remove_recursive(vcpu->debugfs_dentry);
+ 
+ 	kvm = vcpu->kvm;
+ 	kvm_put_kvm(kvm);
+ 
+ 	return 0;
+ }
+ 
+ /* File operations for the vCPU file descriptors made by create_vcpu_fd(). */
+ static struct file_operations kvm_vcpu_fops = {
+ 	.release        = kvm_vcpu_release,
+ 	.unlocked_ioctl = kvm_vcpu_ioctl,
+ 	.mmap           = kvm_vcpu_mmap,
+ 	.llseek		= noop_llseek,
+ 	KVM_COMPAT(kvm_vcpu_compat_ioctl),
+ };
+ 
+ /*
+  * Create an anonymous-inode file descriptor for @vcpu, named
+  * "kvm-vcpu:<id>" and opened O_RDWR | O_CLOEXEC.  Returns whatever
+  * anon_inode_getfd() returns.
+  */
+ static int create_vcpu_fd(struct kvm_vcpu *vcpu)
+ {
+ 	char name[8 + 1 + ITOA_MAX_LEN + 1];
+ 	int fd;
+ 
+ 	snprintf(name, sizeof(name), "kvm-vcpu:%d", vcpu->vcpu_id);
+ 	fd = anon_inode_getfd(name, &kvm_vcpu_fops, vcpu, O_RDWR | O_CLOEXEC);
+ 
+ 	return fd;
+ }
+ 
+ /*
+  * With __KVM_HAVE_ARCH_VCPU_DEBUGFS, create a "vcpu<id>" directory under
+  * the VM's debugfs directory and let the architecture populate it.  Does
+  * nothing when debugfs is not initialised or the architecture has no
+  * per-vCPU debugfs support.
+  */
+ static void kvm_create_vcpu_debugfs(struct kvm_vcpu *vcpu)
+ {
+ #ifdef __KVM_HAVE_ARCH_VCPU_DEBUGFS
+ 	char dir_name[ITOA_MAX_LEN * 2];
+ 	struct dentry *parent;
+ 
+ 	if (!debugfs_initialized())
+ 		return;
+ 
+ 	snprintf(dir_name, sizeof(dir_name), "vcpu%d", vcpu->vcpu_id);
+ 	parent = vcpu->kvm->debugfs_dentry;
+ 	vcpu->debugfs_dentry = debugfs_create_dir(dir_name, parent);
+ 
+ 	kvm_arch_create_vcpu_debugfs(vcpu);
+ #endif
+ }
+ 
+ /*
+  * Create the vCPU with identifier @id in @kvm and return a new file
+  * descriptor for it.
+  *
+  * Returns the descriptor on success.  Fails with -EINVAL when @id is not
+  * below KVM_MAX_VCPU_ID or KVM_MAX_VCPUS vCPUs were already created, with
+  * -EEXIST when a vCPU with @id already exists, or with the error reported
+  * by the architecture hooks or by create_vcpu_fd().  On failure the
+  * created_vcpus count is rolled back.
+  */
+ static int kvm_vm_ioctl_create_vcpu(struct kvm *kvm, u32 id)
+ {
+ 	int r;
+ 	struct kvm_vcpu *vcpu;
+ 
+ 	if (id >= KVM_MAX_VCPU_ID)
+ 		return -EINVAL;
+ 
+ 	mutex_lock(&kvm->lock);
+ 	if (kvm->created_vcpus == KVM_MAX_VCPUS) {
+ 		mutex_unlock(&kvm->lock);
+ 		return -EINVAL;
+ 	}
+ 
+ 	/* Reserve a slot up front; the lock is not held across creation. */
+ 	kvm->created_vcpus++;
+ 	mutex_unlock(&kvm->lock);
+ 
+ 	vcpu = kvm_arch_vcpu_create(kvm, id);
+ 	if (IS_ERR(vcpu)) {
+ 		r = PTR_ERR(vcpu);
+ 		goto vcpu_decrement;
+ 	}
+ 
+ 	preempt_notifier_init(&vcpu->preempt_notifier, &kvm_preempt_ops);
+ 
+ 	r = kvm_arch_vcpu_setup(vcpu);
+ 	if (r)
+ 		goto vcpu_destroy;
+ 
+ 	kvm_create_vcpu_debugfs(vcpu);
+ 
+ 	mutex_lock(&kvm->lock);
+ 	if (kvm_get_vcpu_by_id(kvm, id)) {
+ 		r = -EEXIST;
+ 		goto unlock_vcpu_destroy;
+ 	}
+ 
+ 	BUG_ON(kvm->vcpus[atomic_read(&kvm->online_vcpus)]);
+ 
+ 	/* Now it's all set up, let userspace reach it */
+ 	kvm_get_kvm(kvm);
+ 	r = create_vcpu_fd(vcpu);
+ 	if (r < 0) {
+ 		/* No file was created, so drop the reference just taken. */
+ 		kvm_put_kvm(kvm);
+ 		goto unlock_vcpu_destroy;
+ 	}
+ 
+ 	kvm->vcpus[atomic_read(&kvm->online_vcpus)] = vcpu;
+ 
+ 	/*
+ 	 * Pairs with smp_rmb() in kvm_get_vcpu.  Write kvm->vcpus
+ 	 * before kvm->online_vcpu's incremented value.
+ 	 */
+ 	smp_wmb();
+ 	atomic_inc(&kvm->online_vcpus);
+ 
+ 	mutex_unlock(&kvm->lock);
+ 	kvm_arch_vcpu_postcreate(vcpu);
+ 	return r;
+ 
+ 	/* Error unwinding, in reverse order of the set-up steps above. */
+ unlock_vcpu_destroy:
+ 	mutex_unlock(&kvm->lock);
+ 	debugfs_remove_recursive(vcpu->debugfs_dentry);
+ vcpu_destroy:
+ 	kvm_arch_vcpu_destroy(vcpu);
+ vcpu_decrement:
+ 	mutex_lock(&kvm->lock);
+ 	kvm->created_vcpus--;
+ 	mutex_unlock(&kvm->lock);
+ 	return r;
+ }
+ 
+ /*
+  * Set or clear @vcpu's signal mask.  A NULL @sigset deactivates the mask.
+  * Otherwise SIGKILL and SIGSTOP are removed from @sigset (the caller's
+  * copy is modified) and the result becomes the active mask.  Always
+  * returns 0.
+  */
+ static int kvm_vcpu_ioctl_set_sigmask(struct kvm_vcpu *vcpu, sigset_t *sigset)
+ {
+ 	if (!sigset) {
+ 		vcpu->sigset_active = 0;
+ 		return 0;
+ 	}
+ 
+ 	sigdelsetmask(sigset, sigmask(SIGKILL)|sigmask(SIGSTOP));
+ 	vcpu->sigset_active = 1;
+ 	vcpu->sigset = *sigset;
+ 	return 0;
+ }
+ 
+ /*
+  * ioctl entry point for vCPU file descriptors.
+  *
+  * Rejects callers whose mm differs from the VM's (-EIO) and ioctl numbers
+  * whose type is not KVMIO (-EINVAL).  Architecture-specific asynchronous
+  * ioctls are tried first, without taking the vCPU mutex.  Everything else
+  * runs under vcpu->mutex (-EINTR if taking it is interrupted by a fatal
+  * signal): the generic commands are handled in the switch below and
+  * unknown ones are forwarded to kvm_arch_vcpu_ioctl().
+  *
+  * The @fpu and @kvm_sregs buffers are function-scoped so that the common
+  * exit path can free them whichever way a case leaves the switch.
+  */
+ static long kvm_vcpu_ioctl(struct file *filp,
+ 			   unsigned int ioctl, unsigned long arg)
+ {
+ 	struct kvm_vcpu *vcpu = filp->private_data;
+ 	void __user *argp = (void __user *)arg;
+ 	int r;
+ 	struct kvm_fpu *fpu = NULL;
+ 	struct kvm_sregs *kvm_sregs = NULL;
+ 
+ 	if (vcpu->kvm->mm != current->mm)
+ 		return -EIO;
+ 
+ 	if (unlikely(_IOC_TYPE(ioctl) != KVMIO))
+ 		return -EINVAL;
+ 
+ 	/*
+ 	 * Some architectures have vcpu ioctls that are asynchronous to vcpu
+ 	 * execution; mutex_lock() would break them.
+ 	 */
+ 	r = kvm_arch_vcpu_async_ioctl(filp, ioctl, arg);
+ 	if (r != -ENOIOCTLCMD)
+ 		return r;
+ 
+ 	if (mutex_lock_killable(&vcpu->mutex))
+ 		return -EINTR;
+ 	switch (ioctl) {
+ 	case KVM_RUN: {
+ 		struct pid *oldpid;
+ 		/* KVM_RUN takes no argument. */
+ 		r = -EINVAL;
+ 		if (arg)
+ 			goto out;
+ 		oldpid = rcu_access_pointer(vcpu->pid);
+ 		if (unlikely(oldpid != task_pid(current))) {
+ 			/* The thread running this VCPU changed. */
+ 			struct pid *newpid;
+ 
+ 			r = kvm_arch_vcpu_run_pid_change(vcpu);
+ 			if (r)
+ 				break;
+ 
+ 			newpid = get_task_pid(current, PIDTYPE_PID);
+ 			rcu_assign_pointer(vcpu->pid, newpid);
+ 			/* Wait for RCU readers of the old pid before dropping it. */
+ 			if (oldpid)
+ 				synchronize_rcu();
+ 			put_pid(oldpid);
+ 		}
+ 		r = kvm_arch_vcpu_ioctl_run(vcpu, vcpu->run);
+ 		trace_kvm_userspace_exit(vcpu->run->exit_reason, r);
+ 		break;
+ 	}
+ 	case KVM_GET_REGS: {
+ 		struct kvm_regs *kvm_regs;
+ 
+ 		r = -ENOMEM;
+ 		kvm_regs = kzalloc(sizeof(struct kvm_regs), GFP_KERNEL_ACCOUNT);
+ 		if (!kvm_regs)
+ 			goto out;
+ 		r = kvm_arch_vcpu_ioctl_get_regs(vcpu, kvm_regs);
+ 		if (r)
+ 			goto out_free1;
+ 		r = -EFAULT;
+ 		if (copy_to_user(argp, kvm_regs, sizeof(struct kvm_regs)))
+ 			goto out_free1;
+ 		r = 0;
+ out_free1:
+ 		kfree(kvm_regs);
+ 		break;
+ 	}
+ 	case KVM_SET_REGS: {
+ 		struct kvm_regs *kvm_regs;
+ 
+ 		/* NOTE(review): this value is overwritten on every path below. */
+ 		r = -ENOMEM;
+ 		kvm_regs = memdup_user(argp, sizeof(*kvm_regs));
+ 		if (IS_ERR(kvm_regs)) {
+ 			r = PTR_ERR(kvm_regs);
+ 			goto out;
+ 		}
+ 		r = kvm_arch_vcpu_ioctl_set_regs(vcpu, kvm_regs);
+ 		kfree(kvm_regs);
+ 		break;
+ 	}
+ 	case KVM_GET_SREGS: {
+ 		kvm_sregs = kzalloc(sizeof(struct kvm_sregs),
+ 				    GFP_KERNEL_ACCOUNT);
+ 		r = -ENOMEM;
+ 		if (!kvm_sregs)
+ 			goto out;
+ 		r = kvm_arch_vcpu_ioctl_get_sregs(vcpu, kvm_sregs);
+ 		if (r)
+ 			goto out;
+ 		r = -EFAULT;
+ 		if (copy_to_user(argp, kvm_sregs, sizeof(struct kvm_sregs)))
+ 			goto out;
+ 		r = 0;
+ 		break;
+ 	}
+ 	case KVM_SET_SREGS: {
+ 		kvm_sregs = memdup_user(argp, sizeof(*kvm_sregs));
+ 		if (IS_ERR(kvm_sregs)) {
+ 			r = PTR_ERR(kvm_sregs);
+ 			/* Keep the error pointer away from kfree() at out:. */
+ 			kvm_sregs = NULL;
+ 			goto out;
+ 		}
+ 		r = kvm_arch_vcpu_ioctl_set_sregs(vcpu, kvm_sregs);
+ 		break;
+ 	}
+ 	case KVM_GET_MP_STATE: {
+ 		struct kvm_mp_state mp_state;
+ 
+ 		r = kvm_arch_vcpu_ioctl_get_mpstate(vcpu, &mp_state);
+ 		if (r)
+ 			goto out;
+ 		r = -EFAULT;
+ 		if (copy_to_user(argp, &mp_state, sizeof(mp_state)))
+ 			goto out;
+ 		r = 0;
+ 		break;
+ 	}
+ 	case KVM_SET_MP_STATE: {
+ 		struct kvm_mp_state mp_state;
+ 
+ 		r = -EFAULT;
+ 		if (copy_from_user(&mp_state, argp, sizeof(mp_state)))
+ 			goto out;
+ 		r = kvm_arch_vcpu_ioctl_set_mpstate(vcpu, &mp_state);
+ 		break;
+ 	}
+ 	case KVM_TRANSLATE: {
+ 		struct kvm_translation tr;
+ 
+ 		r = -EFAULT;
+ 		if (copy_from_user(&tr, argp, sizeof(tr)))
+ 			goto out;
+ 		r = kvm_arch_vcpu_ioctl_translate(vcpu, &tr);
+ 		if (r)
+ 			goto out;
+ 		r = -EFAULT;
+ 		if (copy_to_user(argp, &tr, sizeof(tr)))
+ 			goto out;
+ 		r = 0;
+ 		break;
+ 	}
+ 	case KVM_SET_GUEST_DEBUG: {
+ 		struct kvm_guest_debug dbg;
+ 
+ 		r = -EFAULT;
+ 		if (copy_from_user(&dbg, argp, sizeof(dbg)))
+ 			goto out;
+ 		r = kvm_arch_vcpu_ioctl_set_guest_debug(vcpu, &dbg);
+ 		break;
+ 	}
+ 	case KVM_SET_SIGNAL_MASK: {
+ 		struct kvm_signal_mask __user *sigmask_arg = argp;
+ 		struct kvm_signal_mask kvm_sigmask;
+ 		sigset_t sigset, *p;
+ 
+ 		/* A NULL argument clears the vCPU's signal mask. */
+ 		p = NULL;
+ 		if (argp) {
+ 			r = -EFAULT;
+ 			if (copy_from_user(&kvm_sigmask, argp,
+ 					   sizeof(kvm_sigmask)))
+ 				goto out;
+ 			r = -EINVAL;
+ 			if (kvm_sigmask.len != sizeof(sigset))
+ 				goto out;
+ 			r = -EFAULT;
+ 			if (copy_from_user(&sigset, sigmask_arg->sigset,
+ 					   sizeof(sigset)))
+ 				goto out;
+ 			p = &sigset;
+ 		}
+ 		r = kvm_vcpu_ioctl_set_sigmask(vcpu, p);
+ 		break;
+ 	}
+ 	case KVM_GET_FPU: {
+ 		fpu = kzalloc(sizeof(struct kvm_fpu), GFP_KERNEL_ACCOUNT);
+ 		r = -ENOMEM;
+ 		if (!fpu)
+ 			goto out;
+ 		r = kvm_arch_vcpu_ioctl_get_fpu(vcpu, fpu);
+ 		if (r)
+ 			goto out;
+ 		r = -EFAULT;
+ 		if (copy_to_user(argp, fpu, sizeof(struct kvm_fpu)))
+ 			goto out;
+ 		r = 0;
+ 		break;
+ 	}
+ 	case KVM_SET_FPU: {
+ 		fpu = memdup_user(argp, sizeof(*fpu));
+ 		if (IS_ERR(fpu)) {
+ 			r = PTR_ERR(fpu);
+ 			/* Keep the error pointer away from kfree() at out:. */
+ 			fpu = NULL;
+ 			goto out;
+ 		}
+ 		r = kvm_arch_vcpu_ioctl_set_fpu(vcpu, fpu);
+ 		break;
+ 	}
+ 	default:
+ 		r = kvm_arch_vcpu_ioctl(filp, ioctl, arg);
+ 	}
+ out:
+ 	/* Common exit: unlock and free the function-scoped buffers. */
+ 	mutex_unlock(&vcpu->mutex);
+ 	kfree(fpu);
+ 	kfree(kvm_sregs);
+ 	return r;
+ }
+ 
+ #ifdef CONFIG_KVM_COMPAT
+ /* 32-bit compat entry for vCPU ioctls; only KVM_SET_SIGNAL_MASK differs. */
+ static long kvm_vcpu_compat_ioctl(struct file *filp,
+ 				  unsigned int ioctl, unsigned long arg)
+ {
+ 	struct kvm_vcpu *vcpu = filp->private_data;
+ 	void __user *argp = compat_ptr(arg);
+ 	int r;
+ 
+ 	if (vcpu->kvm->mm != current->mm)
+ 		return -EIO;
+ 
+ 	switch (ioctl) {
+ 	case KVM_SET_SIGNAL_MASK: {
+ 		struct kvm_signal_mask __user *sigmask_arg = argp;
+ 		struct kvm_signal_mask kvm_sigmask;
+ 		sigset_t sigset;
+ 
+ 		if (argp) {
+ 			r = -EFAULT;
+ 			if (copy_from_user(&kvm_sigmask, argp, sizeof(kvm_sigmask)))
+ 				goto out;
+ 			r = -EINVAL;
+ 			if (kvm_sigmask.len != sizeof(compat_sigset_t))
+ 				goto out;
+ 			r = -EFAULT;	/* cast below must keep the __user address space */
+ 			if (get_compat_sigset(&sigset,
+ 					      (compat_sigset_t __user *)sigmask_arg->sigset))
+ 				goto out;
+ 			r = kvm_vcpu_ioctl_set_sigmask(vcpu, &sigset);
+ 		} else
+ 			r = kvm_vcpu_ioctl_set_sigmask(vcpu, NULL);
+ 		break;
+ 	}
+ 	default:
+ 		r = kvm_vcpu_ioctl(filp, ioctl, arg);
+ 	}
+ out:
+ 	return r;
+ }
+ #endif
+ 
+ /* mmap() on a device fd: delegate to the device's ops, or -ENODEV if none. */
+ static int kvm_device_mmap(struct file *filp, struct vm_area_struct *vma)
+ {
+ 	struct kvm_device *kdev = filp->private_data;
+ 
+ 	if (!kdev->ops->mmap)
+ 		return -ENODEV;
+ 	return kdev->ops->mmap(kdev, vma);
+ }
+ 
+ /* Copy a kvm_device_attr in from userspace and hand it to @accessor. */
+ static int kvm_device_ioctl_attr(struct kvm_device *dev,
+ 				 int (*accessor)(struct kvm_device *dev,
+ 						 struct kvm_device_attr *attr),
+ 				 unsigned long arg)
+ {
+ 	void __user *uattr = (void __user *)arg;
+ 	struct kvm_device_attr kattr;
+ 
+ 	if (!accessor)
+ 		return -EPERM;	/* no accessor supplied for this operation */
+ 	if (copy_from_user(&kattr, uattr, sizeof(kattr)))
+ 		return -EFAULT;
+ 	return accessor(dev, &kattr);
+ }
+ 
+ /* ioctl handler for device fds: attribute ops first, then the device's own. */
+ static long kvm_device_ioctl(struct file *filp, unsigned int ioctl,
+ 			     unsigned long arg)
+ {
+ 	struct kvm_device *kdev = filp->private_data;
+ 	const struct kvm_device_ops *ops = kdev->ops;
+ 
+ 	/* Reject callers whose mm is not the one the VM was created with. */
+ 	if (kdev->kvm->mm != current->mm)
+ 		return -EIO;
+ 
+ 	switch (ioctl) {
+ 	case KVM_SET_DEVICE_ATTR:
+ 		return kvm_device_ioctl_attr(kdev, ops->set_attr, arg);
+ 	case KVM_GET_DEVICE_ATTR:
+ 		return kvm_device_ioctl_attr(kdev, ops->get_attr, arg);
+ 	case KVM_HAS_DEVICE_ATTR:
+ 		return kvm_device_ioctl_attr(kdev, ops->has_attr, arg);
+ 	default:
+ 		return ops->ioctl ? ops->ioctl(kdev, ioctl, arg) : -ENOTTY;
+ 	}
+ }
+ 
+ static int kvm_device_release(struct inode *inode, struct file *filp)
+ {	/* ->release() for device fds; always returns 0. */
+ 	struct kvm_device *dev = filp->private_data;
+ 	struct kvm *kvm = dev->kvm;
+ 
+ 	if (dev->ops->release) {	/* optional op; devices without it stay on the VM list */
+ 		mutex_lock(&kvm->lock);
+ 		list_del(&dev->vm_node);	/* unlink before the device's release hook runs */
+ 		dev->ops->release(dev);
+ 		mutex_unlock(&kvm->lock);
+ 	}
+ 
+ 	kvm_put_kvm(kvm);	/* pairs with kvm_get_kvm() in kvm_ioctl_create_device() */
+ 	return 0;
+ }
+ 
+ static const struct file_operations kvm_device_fops = {
+ 	.unlocked_ioctl = kvm_device_ioctl,
+ 	.release = kvm_device_release,
+ 	KVM_COMPAT(kvm_device_ioctl),	/* the native handler is passed for compat too */
+ 	.mmap = kvm_device_mmap,
+ };	/* kvm_device_from_filp() identifies device files by this table's address */
+ 
+ /* Return the kvm_device behind @filp, or NULL if it is not a device fd. */
+ struct kvm_device *kvm_device_from_filp(struct file *filp)
+ {
+ 	bool is_kvm_device = filp->f_op == &kvm_device_fops;
+ 
+ 	return is_kvm_device ? filp->private_data : NULL;
+ }
+ 
+ static struct kvm_device_ops *kvm_device_ops_table[KVM_DEV_TYPE_MAX] = {
+ #ifdef CONFIG_KVM_MPIC
+ 	[KVM_DEV_TYPE_FSL_MPIC_20]	= &kvm_mpic_ops,
+ 	[KVM_DEV_TYPE_FSL_MPIC_42]	= &kvm_mpic_ops,
+ #endif /* CONFIG_KVM_MPIC */
+ };	/* indexed by device type; a NULL slot means no ops are registered */
+ 
+ /* Install @ops in slot @type; -ENOSPC if out of range, -EEXIST if taken. */
+ int kvm_register_device_ops(struct kvm_device_ops *ops, u32 type)
+ {
+ 	if (type >= ARRAY_SIZE(kvm_device_ops_table))
+ 		return -ENOSPC;
+ 	if (kvm_device_ops_table[type])
+ 		return -EEXIST;
+ 
+ 	kvm_device_ops_table[type] = ops;
+ 	return 0;
+ }
+ 
+ void kvm_unregister_device_ops(u32 type)
+ {	/* Clear the ops slot for @type; out-of-range types are ignored. */
+ 	if (type < ARRAY_SIZE(kvm_device_ops_table))	/* never index past the table */
+ 		kvm_device_ops_table[type] = NULL;
+ }
+ 
+ static int kvm_ioctl_create_device(struct kvm *kvm,
+ 				   struct kvm_create_device *cd)
+ {	/* Create a device of type cd->type on @kvm; on success cd->fd is its new fd. */
+ 	struct kvm_device_ops *ops = NULL;
+ 	struct kvm_device *dev;
+ 	bool test = cd->flags & KVM_CREATE_DEVICE_TEST;
+ 	int type;
+ 	int ret;
+ 
+ 	if (cd->type >= ARRAY_SIZE(kvm_device_ops_table))
+ 		return -ENODEV;
+ 
+ 	type = array_index_nospec(cd->type, ARRAY_SIZE(kvm_device_ops_table));
+ 	ops = kvm_device_ops_table[type];
+ 	if (ops == NULL)	/* no ops registered for this type */
+ 		return -ENODEV;
+ 
+ 	if (test)	/* test flag: report that the type exists without creating it */
+ 		return 0;
+ 
+ 	dev = kzalloc(sizeof(*dev), GFP_KERNEL_ACCOUNT);
+ 	if (!dev)
+ 		return -ENOMEM;
+ 
+ 	dev->ops = ops;
+ 	dev->kvm = kvm;
+ 
+ 	mutex_lock(&kvm->lock);
+ 	ret = ops->create(dev, type);
+ 	if (ret < 0) {
+ 		mutex_unlock(&kvm->lock);
+ 		kfree(dev);	/* create failed: dev was never linked, plain free suffices */
+ 		return ret;
+ 	}
+ 	list_add(&dev->vm_node, &kvm->devices);
+ 	mutex_unlock(&kvm->lock);
+ 
+ 	if (ops->init)	/* optional hook, invoked after kvm->lock is dropped */
+ 		ops->init(dev);
+ 
+ 	kvm_get_kvm(kvm);	/* reference owned by the fd; dropped in kvm_device_release() */
+ 	ret = anon_inode_getfd(ops->name, &kvm_device_fops, dev, O_RDWR | O_CLOEXEC);
+ 	if (ret < 0) {
+ 		kvm_put_kvm(kvm);	/* no fd was created, so give the reference back */
+ 		mutex_lock(&kvm->lock);
+ 		list_del(&dev->vm_node);
+ 		mutex_unlock(&kvm->lock);
+ 		ops->destroy(dev);	/* NOTE(review): presumably frees dev - confirm per device */
+ 		return ret;
+ 	}
+ 
+ 	cd->fd = ret;
+ 	return 0;
+ }
+ 
+ static long kvm_vm_ioctl_check_extension_generic(struct kvm *kvm, long arg)
+ {	/* Answer capability queries handled here; all others go to the arch hook. */
+ 	switch (arg) {
+ 	case KVM_CAP_USER_MEMORY:
+ 	case KVM_CAP_DESTROY_MEMORY_REGION_WORKS:
+ 	case KVM_CAP_JOIN_MEMORY_REGIONS_WORKS:
+ 	case KVM_CAP_INTERNAL_ERROR_DATA:
+ #ifdef CONFIG_HAVE_KVM_MSI
+ 	case KVM_CAP_SIGNAL_MSI:
+ #endif
+ #ifdef CONFIG_HAVE_KVM_IRQFD
+ 	case KVM_CAP_IRQFD:
+ 	case KVM_CAP_IRQFD_RESAMPLE:
+ #endif
+ 	case KVM_CAP_IOEVENTFD_ANY_LENGTH:
+ 	case KVM_CAP_CHECK_EXTENSION_VM:
+ 	case KVM_CAP_ENABLE_CAP_VM:
+ #ifdef CONFIG_KVM_GENERIC_DIRTYLOG_READ_PROTECT
+ 	case KVM_CAP_MANUAL_DIRTY_LOG_PROTECT2:
+ #endif
+ 		return 1;	/* plain yes for every capability listed above */
+ #ifdef CONFIG_KVM_MMIO
+ 	case KVM_CAP_COALESCED_MMIO:
+ 		return KVM_COALESCED_MMIO_PAGE_OFFSET;
+ 	case KVM_CAP_COALESCED_PIO:
+ 		return 1;
+ #endif
+ #ifdef CONFIG_HAVE_KVM_IRQ_ROUTING
+ 	case KVM_CAP_IRQ_ROUTING:
+ 		return KVM_MAX_IRQ_ROUTES;
+ #endif
+ #if KVM_ADDRESS_SPACE_NUM > 1
+ 	case KVM_CAP_MULTI_ADDRESS_SPACE:
+ 		return KVM_ADDRESS_SPACE_NUM;
+ #endif
+ 	case KVM_CAP_NR_MEMSLOTS:
+ 		return KVM_USER_MEM_SLOTS;
+ 	default:
+ 		break;
+ 	}
+ 	return kvm_vm_ioctl_check_extension(kvm, arg);	/* @kvm is NULL for /dev/kvm queries */
+ }
+ 
+ int __attribute__((weak)) kvm_vm_ioctl_enable_cap(struct kvm *kvm,
+ 						  struct kvm_enable_cap *cap)
+ {	/* Weak default: rejects every capability unless a strong definition exists. */
+ 	return -EINVAL;
+ }
+ 
+ static int kvm_vm_ioctl_enable_cap_generic(struct kvm *kvm,
+ 					   struct kvm_enable_cap *cap)
+ {	/* Enable capabilities handled here; others go to kvm_vm_ioctl_enable_cap(). */
+ 	switch (cap->cap) {
+ #ifdef CONFIG_KVM_GENERIC_DIRTYLOG_READ_PROTECT
+ 	case KVM_CAP_MANUAL_DIRTY_LOG_PROTECT2:
+ 		if (cap->flags || (cap->args[0] & ~1))	/* no flags; args[0] must be 0 or 1 */
+ 			return -EINVAL;
+ 		kvm->manual_dirty_log_protect = cap->args[0];
+ 		return 0;
+ #endif
+ 	default:
+ 		return kvm_vm_ioctl_enable_cap(kvm, cap);
+ 	}
+ }
+ 
+ static long kvm_vm_ioctl(struct file *filp,
+ 			   unsigned int ioctl, unsigned long arg)
+ {	/* ioctl dispatcher for VM fds; unknown ioctls go to kvm_arch_vm_ioctl(). */
+ 	struct kvm *kvm = filp->private_data;
+ 	void __user *argp = (void __user *)arg;
+ 	int r;
+ 
+ 	if (kvm->mm != current->mm)	/* caller's mm must match the VM's */
+ 		return -EIO;
+ 	switch (ioctl) {
+ 	case KVM_CREATE_VCPU:
+ 		r = kvm_vm_ioctl_create_vcpu(kvm, arg);	/* arg is passed by value here */
+ 		break;
+ 	case KVM_ENABLE_CAP: {
+ 		struct kvm_enable_cap cap;
+ 
+ 		r = -EFAULT;
+ 		if (copy_from_user(&cap, argp, sizeof(cap)))
+ 			goto out;
+ 		r = kvm_vm_ioctl_enable_cap_generic(kvm, &cap);
+ 		break;
+ 	}
+ 	case KVM_SET_USER_MEMORY_REGION: {
+ 		struct kvm_userspace_memory_region kvm_userspace_mem;
+ 
+ 		r = -EFAULT;
+ 		if (copy_from_user(&kvm_userspace_mem, argp,
+ 						sizeof(kvm_userspace_mem)))
+ 			goto out;
+ 
+ 		r = kvm_vm_ioctl_set_memory_region(kvm, &kvm_userspace_mem);
+ 		break;
+ 	}
+ 	case KVM_GET_DIRTY_LOG: {
+ 		struct kvm_dirty_log log;
+ 
+ 		r = -EFAULT;
+ 		if (copy_from_user(&log, argp, sizeof(log)))
+ 			goto out;
+ 		r = kvm_vm_ioctl_get_dirty_log(kvm, &log);
+ 		break;
+ 	}
+ #ifdef CONFIG_KVM_GENERIC_DIRTYLOG_READ_PROTECT
+ 	case KVM_CLEAR_DIRTY_LOG: {
+ 		struct kvm_clear_dirty_log log;
+ 
+ 		r = -EFAULT;
+ 		if (copy_from_user(&log, argp, sizeof(log)))
+ 			goto out;
+ 		r = kvm_vm_ioctl_clear_dirty_log(kvm, &log);
+ 		break;
+ 	}
+ #endif
+ #ifdef CONFIG_KVM_MMIO
+ 	case KVM_REGISTER_COALESCED_MMIO: {
+ 		struct kvm_coalesced_mmio_zone zone;
+ 
+ 		r = -EFAULT;
+ 		if (copy_from_user(&zone, argp, sizeof(zone)))
+ 			goto out;
+ 		r = kvm_vm_ioctl_register_coalesced_mmio(kvm, &zone);
+ 		break;
+ 	}
+ 	case KVM_UNREGISTER_COALESCED_MMIO: {
+ 		struct kvm_coalesced_mmio_zone zone;
+ 
+ 		r = -EFAULT;
+ 		if (copy_from_user(&zone, argp, sizeof(zone)))
+ 			goto out;
+ 		r = kvm_vm_ioctl_unregister_coalesced_mmio(kvm, &zone);
+ 		break;
+ 	}
+ #endif
+ 	case KVM_IRQFD: {
+ 		struct kvm_irqfd data;
+ 
+ 		r = -EFAULT;
+ 		if (copy_from_user(&data, argp, sizeof(data)))
+ 			goto out;
+ 		r = kvm_irqfd(kvm, &data);
+ 		break;
+ 	}
+ 	case KVM_IOEVENTFD: {
+ 		struct kvm_ioeventfd data;
+ 
+ 		r = -EFAULT;
+ 		if (copy_from_user(&data, argp, sizeof(data)))
+ 			goto out;
+ 		r = kvm_ioeventfd(kvm, &data);
+ 		break;
+ 	}
+ #ifdef CONFIG_HAVE_KVM_MSI
+ 	case KVM_SIGNAL_MSI: {
+ 		struct kvm_msi msi;
+ 
+ 		r = -EFAULT;
+ 		if (copy_from_user(&msi, argp, sizeof(msi)))
+ 			goto out;
+ 		r = kvm_send_userspace_msi(kvm, &msi);
+ 		break;
+ 	}
+ #endif
+ #ifdef __KVM_HAVE_IRQ_LINE
+ 	case KVM_IRQ_LINE_STATUS:
+ 	case KVM_IRQ_LINE: {
+ 		struct kvm_irq_level irq_event;
+ 
+ 		r = -EFAULT;
+ 		if (copy_from_user(&irq_event, argp, sizeof(irq_event)))
+ 			goto out;
+ 
+ 		r = kvm_vm_ioctl_irq_line(kvm, &irq_event,
+ 					ioctl == KVM_IRQ_LINE_STATUS);
+ 		if (r)
+ 			goto out;
+ 
+ 		r = -EFAULT;
+ 		if (ioctl == KVM_IRQ_LINE_STATUS) {	/* only _STATUS copies the result back */
+ 			if (copy_to_user(argp, &irq_event, sizeof(irq_event)))
+ 				goto out;
+ 		}
+ 
+ 		r = 0;
+ 		break;
+ 	}
+ #endif
+ #ifdef CONFIG_HAVE_KVM_IRQ_ROUTING
+ 	case KVM_SET_GSI_ROUTING: {
+ 		struct kvm_irq_routing routing;
+ 		struct kvm_irq_routing __user *urouting;
+ 		struct kvm_irq_routing_entry *entries = NULL;
+ 
+ 		r = -EFAULT;
+ 		if (copy_from_user(&routing, argp, sizeof(routing)))
+ 			goto out;
+ 		r = -EINVAL;
+ 		if (!kvm_arch_can_set_irq_routing(kvm))
+ 			goto out;
+ 		if (routing.nr > KVM_MAX_IRQ_ROUTES)	/* also bounds the allocation below */
+ 			goto out;
+ 		if (routing.flags)	/* any flag bit is rejected */
+ 			goto out;
+ 		if (routing.nr) {
+ 			r = -ENOMEM;
+ 			entries = vmalloc(array_size(sizeof(*entries),
+ 						     routing.nr));
+ 			if (!entries)
+ 				goto out;
+ 			r = -EFAULT;
+ 			urouting = argp;
+ 			if (copy_from_user(entries, urouting->entries,
+ 					   routing.nr * sizeof(*entries)))
+ 				goto out_free_irq_routing;
+ 		}
+ 		r = kvm_set_irq_routing(kvm, entries, routing.nr,
+ 					routing.flags);
+ out_free_irq_routing:
+ 		vfree(entries);	/* entries is still NULL when routing.nr == 0 */
+ 		break;
+ 	}
+ #endif /* CONFIG_HAVE_KVM_IRQ_ROUTING */
+ 	case KVM_CREATE_DEVICE: {
+ 		struct kvm_create_device cd;
+ 
+ 		r = -EFAULT;
+ 		if (copy_from_user(&cd, argp, sizeof(cd)))
+ 			goto out;
+ 
+ 		r = kvm_ioctl_create_device(kvm, &cd);
+ 		if (r)
+ 			goto out;
+ 
+ 		r = -EFAULT;
+ 		if (copy_to_user(argp, &cd, sizeof(cd)))	/* hands cd.fd back to userspace */
+ 			goto out;
+ 
+ 		r = 0;
+ 		break;
+ 	}
+ 	case KVM_CHECK_EXTENSION:
+ 		r = kvm_vm_ioctl_check_extension_generic(kvm, arg);
+ 		break;
+ 	default:
+ 		r = kvm_arch_vm_ioctl(filp, ioctl, arg);
+ 	}
+ out:
+ 	return r;
+ }
+ 
+ #ifdef CONFIG_KVM_COMPAT
+ struct compat_kvm_dirty_log {	/* layout copied in by kvm_vm_compat_ioctl() */
+ 	__u32 slot;
+ 	__u32 padding1;
+ 	union {
+ 		compat_uptr_t dirty_bitmap; /* one bit per page */
+ 		__u64 padding2;	/* shares storage with dirty_bitmap */
+ 	};
+ };
+ 
+ static long kvm_vm_compat_ioctl(struct file *filp,
+ 			   unsigned int ioctl, unsigned long arg)
+ {	/* Compat entry for VM ioctls; only KVM_GET_DIRTY_LOG is translated here. */
+ 	struct kvm *kvm = filp->private_data;
+ 	int r;
+ 
+ 	if (kvm->mm != current->mm)
+ 		return -EIO;
+ 	switch (ioctl) {
+ 	case KVM_GET_DIRTY_LOG: {
+ 		struct compat_kvm_dirty_log compat_log;
+ 		struct kvm_dirty_log log;
+ 
+ 		if (copy_from_user(&compat_log, (void __user *)arg,
+ 				   sizeof(compat_log)))
+ 			return -EFAULT;
+ 		log.slot	 = compat_log.slot;
+ 		log.padding1	 = compat_log.padding1;
+ 		log.padding2	 = compat_log.padding2;
+ 		log.dirty_bitmap = compat_ptr(compat_log.dirty_bitmap);	/* widen the compat pointer */
+ 
+ 		r = kvm_vm_ioctl_get_dirty_log(kvm, &log);
+ 		break;
+ 	}
+ 	default:
+ 		r = kvm_vm_ioctl(filp, ioctl, arg);	/* everything else uses the native path */
+ 	}
+ 	return r;
+ }
+ #endif
+ 
+ static struct file_operations kvm_vm_fops = {	/* used for the "kvm-vm" anon inode file */
+ 	.release        = kvm_vm_release,
+ 	.unlocked_ioctl = kvm_vm_ioctl,
+ 	.llseek		= noop_llseek,
+ 	KVM_COMPAT(kvm_vm_compat_ioctl),	/* dedicated compat handler */
+ };
+ 
+ static int kvm_dev_ioctl_create_vm(unsigned long type)
+ {	/* Create a VM and return a new fd for it, or a negative errno. */
+ 	int r;
+ 	struct kvm *kvm;
+ 	struct file *file;
+ 
+ 	kvm = kvm_create_vm(type);
+ 	if (IS_ERR(kvm))
+ 		return PTR_ERR(kvm);
+ #ifdef CONFIG_KVM_MMIO
+ 	r = kvm_coalesced_mmio_init(kvm);
+ 	if (r < 0)
+ 		goto put_kvm;
+ #endif
+ 	r = get_unused_fd_flags(O_CLOEXEC);	/* reserve the fd number; installed last */
+ 	if (r < 0)
+ 		goto put_kvm;
+ 
+ 	file = anon_inode_getfile("kvm-vm", &kvm_vm_fops, kvm, O_RDWR);
+ 	if (IS_ERR(file)) {
+ 		put_unused_fd(r);
+ 		r = PTR_ERR(file);
+ 		goto put_kvm;
+ 	}
+ 
+ 	/*
+ 	 * Don't call kvm_put_kvm anymore at this point; file->f_op is
+ 	 * already set, with ->release() being kvm_vm_release().  In error
+ 	 * cases it will be called by the final fput(file) and will take
+ 	 * care of doing kvm_put_kvm(kvm).
+ 	 */
+ 	if (kvm_create_vm_debugfs(kvm, r) < 0) {
+ 		put_unused_fd(r);
+ 		fput(file);
+ 		return -ENOMEM;	/* reported as -ENOMEM whatever the helper returned */
+ 	}
+ 	kvm_uevent_notify_change(KVM_EVENT_CREATE_VM, kvm);
+ 
+ 	fd_install(r, file);	/* fd becomes visible to userspace only here */
+ 	return r;
+ 
+ put_kvm:
+ 	kvm_put_kvm(kvm);
+ 	return r;
+ }
+ 
+ static long kvm_dev_ioctl(struct file *filp,
+ 			  unsigned int ioctl, unsigned long arg)
+ {	/* ioctl handler of the kvm character device (see kvm_chardev_ops). */
+ 	long r = -EINVAL;	/* returned when an ioctl that takes no argument gets one */
+ 
+ 	switch (ioctl) {
+ 	case KVM_GET_API_VERSION:
+ 		if (arg)
+ 			goto out;
+ 		r = KVM_API_VERSION;
+ 		break;
+ 	case KVM_CREATE_VM:
+ 		r = kvm_dev_ioctl_create_vm(arg);
+ 		break;
+ 	case KVM_CHECK_EXTENSION:
+ 		r = kvm_vm_ioctl_check_extension_generic(NULL, arg);	/* no VM in this context */
+ 		break;
+ 	case KVM_GET_VCPU_MMAP_SIZE:
+ 		if (arg)
+ 			goto out;
+ 		r = PAGE_SIZE;     /* struct kvm_run */
+ #ifdef CONFIG_X86
+ 		r += PAGE_SIZE;    /* pio data page */
+ #endif
+ #ifdef CONFIG_KVM_MMIO
+ 		r += PAGE_SIZE;    /* coalesced mmio ring page */
+ #endif
+ 		break;
+ 	case KVM_TRACE_ENABLE:
+ 	case KVM_TRACE_PAUSE:
+ 	case KVM_TRACE_DISABLE:
+ 		r = -EOPNOTSUPP;	/* trace ioctls are recognised but not supported */
+ 		break;
+ 	default:
+ 		return kvm_arch_dev_ioctl(filp, ioctl, arg);
+ 	}
+ out:
+ 	return r;
+ }
+ 
+ static struct file_operations kvm_chardev_ops = {	/* fops referenced by kvm_dev */
+ 	.unlocked_ioctl = kvm_dev_ioctl,
+ 	.llseek		= noop_llseek,
+ 	KVM_COMPAT(kvm_dev_ioctl),	/* the native handler is passed for compat too */
+ };
+ 
+ static struct miscdevice kvm_dev = {	/* positional init: minor, name, fops */
+ 	KVM_MINOR,
+ 	"kvm",
+ 	&kvm_chardev_ops,
+ };
+ 
+ /* Enable virtualization on the calling CPU unless it is already marked on. */
+ static void hardware_enable_nolock(void *junk)
+ {
+ 	int this_cpu = raw_smp_processor_id();
+ 
+ 	if (cpumask_test_cpu(this_cpu, cpus_hardware_enabled))
+ 		return;
+ 
+ 	/* Mark the CPU first; the mark is rolled back if the arch hook fails. */
+ 	cpumask_set_cpu(this_cpu, cpus_hardware_enabled);
+ 	if (!kvm_arch_hardware_enable())
+ 		return;
+ 
+ 	cpumask_clear_cpu(this_cpu, cpus_hardware_enabled);
+ 	/* Record the failure; hardware_enable_all() reads this counter. */
+ 	atomic_inc(&hardware_enable_failed);
+ 	pr_info("kvm: enabling virtualization on CPU%d failed\n", this_cpu);
+ }
+ 
+ static int kvm_starting_cpu(unsigned int cpu)
+ {	/* Enable hardware on the calling CPU if KVM is in use; always returns 0. */
+ 	raw_spin_lock(&kvm_count_lock);
+ 	if (kvm_usage_count)	/* non-zero after a successful hardware_enable_all() */
+ 		hardware_enable_nolock(NULL);
+ 	raw_spin_unlock(&kvm_count_lock);
+ 	return 0;
+ }
+ 
+ static void hardware_disable_nolock(void *junk)
+ {	/* Disable virtualization on the calling CPU if it is marked enabled. */
+ 	int cpu = raw_smp_processor_id();
+ 
+ 	if (!cpumask_test_cpu(cpu, cpus_hardware_enabled))
+ 		return;	/* already off on this CPU: nothing to do */
+ 	cpumask_clear_cpu(cpu, cpus_hardware_enabled);
+ 	kvm_arch_hardware_disable();
+ }
+ 
+ static int kvm_dying_cpu(unsigned int cpu)
+ {	/* Disable hardware on the calling CPU if KVM is in use; always returns 0. */
+ 	raw_spin_lock(&kvm_count_lock);
+ 	if (kvm_usage_count)
+ 		hardware_disable_nolock(NULL);
+ 	raw_spin_unlock(&kvm_count_lock);
+ 	return 0;
+ }
+ 
+ static void hardware_disable_all_nolock(void)
+ {	/* Drop one usage count; callers in this file hold kvm_count_lock. */
+ 	BUG_ON(!kvm_usage_count);	/* an unbalanced disable is a bug */
+ 
+ 	kvm_usage_count--;
+ 	if (!kvm_usage_count)	/* last user gone: turn the hardware off on every CPU */
+ 		on_each_cpu(hardware_disable_nolock, NULL, 1);
+ }
+ 
+ static void hardware_disable_all(void)
+ {	/* Locked wrapper around hardware_disable_all_nolock(). */
+ 	raw_spin_lock(&kvm_count_lock);
+ 	hardware_disable_all_nolock();
+ 	raw_spin_unlock(&kvm_count_lock);
+ }
+ 
+ static int hardware_enable_all(void)
+ {	/* Take a usage count; the first user enables hardware on every CPU. */
+ 	int r = 0;
+ 
+ 	raw_spin_lock(&kvm_count_lock);
+ 
+ 	kvm_usage_count++;
+ 	if (kvm_usage_count == 1) {
+ 		atomic_set(&hardware_enable_failed, 0);
+ 		on_each_cpu(hardware_enable_nolock, NULL, 1);
+ 
+ 		if (atomic_read(&hardware_enable_failed)) {
+ 			hardware_disable_all_nolock();	/* also undoes the increment above */
+ 			r = -EBUSY;
+ 		}
+ 	}
+ 
+ 	raw_spin_unlock(&kvm_count_lock);
+ 
+ 	return r;
+ }
+ 
+ static int kvm_reboot(struct notifier_block *notifier, unsigned long val,
+ 		      void *v)
+ {
+ 	/*
+ 	 * Some (well, at least mine) BIOSes hang on reboot if
+ 	 * in vmx root mode.
+ 	 *
+ 	 * Intel TXT also requires VMX to be off on all CPUs at system shutdown.
+ 	 */
+ 	pr_info("kvm: exiting hardware virtualization\n");
+ 	kvm_rebooting = true;	/* set before the per-CPU disable calls run */
+ 	on_each_cpu(hardware_disable_nolock, NULL, 1);
+ 	return NOTIFY_OK;
+ }
+ 
+ static struct notifier_block kvm_reboot_notifier = {
+ 	.notifier_call = kvm_reboot,	/* disables virtualization on every CPU */
+ 	.priority = 0,
+ };
+ 
+ /* Run the destructor of every device on @bus, then free the bus itself. */
+ static void kvm_io_bus_destroy(struct kvm_io_bus *bus)
+ {
+ 	int idx;
+ 
+ 	for (idx = 0; idx < bus->dev_count; idx++)
+ 		kvm_iodevice_destructor(bus->range[idx].dev);
+ 
+ 	/* The range array is part of the bus allocation, so one kfree covers both. */
+ 	kfree(bus);
+ }
+ 
+ static inline int kvm_io_bus_cmp(const struct kvm_io_range *r1,
+ 				 const struct kvm_io_range *r2)
+ {	/* Three-way compare: -1 if r1 starts below r2, 1 if it ends above, else 0. */
+ 	gpa_t addr1 = r1->addr;
+ 	gpa_t addr2 = r2->addr;
+ 
+ 	if (addr1 < addr2)
+ 		return -1;
+ 
+ 	/* If r2->len == 0, match the exact address.  If r2->len != 0,
+ 	 * accept any overlapping write.  Any order is acceptable for
+ 	 * overlapping ranges, because kvm_io_bus_get_first_dev ensures
+ 	 * we process all of them.
+ 	 */
+ 	if (r2->len) {
+ 		addr1 += r1->len;	/* from here on, compare the end addresses */
+ 		addr2 += r2->len;
+ 	}
+ 
+ 	if (addr1 > addr2)
+ 		return 1;
+ 
+ 	return 0;
+ }
+ 
+ static int kvm_io_bus_sort_cmp(const void *p1, const void *p2)
+ {	/* void-pointer adapter so kvm_io_bus_cmp() can be handed to bsearch(). */
+ 	return kvm_io_bus_cmp(p1, p2);
+ }
+ 
+ static int kvm_io_bus_get_first_dev(struct kvm_io_bus *bus,
+ 			     gpa_t addr, int len)
+ {	/* Index of the first range matching [addr, addr + len), or -ENOENT. */
+ 	struct kvm_io_range *range, key;
+ 	int off;
+ 
+ 	key = (struct kvm_io_range) {
+ 		.addr = addr,
+ 		.len = len,
+ 	};
+ 
+ 	range = bsearch(&key, bus->range, bus->dev_count,
+ 			sizeof(struct kvm_io_range), kvm_io_bus_sort_cmp);
+ 	if (range == NULL)
+ 		return -ENOENT;
+ 
+ 	off = range - bus->range;
+ 
+ 	while (off > 0 && kvm_io_bus_cmp(&key, &bus->range[off-1]) == 0)
+ 		off--;	/* bsearch may land on any match; walk back to the first */
+ 
+ 	return off;
+ }
+ 
+ static int __kvm_io_bus_write(struct kvm_vcpu *vcpu, struct kvm_io_bus *bus,
+ 			      struct kvm_io_range *range, const void *val)
+ {	/* Offer the write to each matching device; returns its index or -EOPNOTSUPP. */
+ 	int idx;
+ 
+ 	idx = kvm_io_bus_get_first_dev(bus, range->addr, range->len);
+ 	if (idx < 0)
+ 		return -EOPNOTSUPP;
+ 
+ 	while (idx < bus->dev_count &&
+ 		kvm_io_bus_cmp(range, &bus->range[idx]) == 0) {
+ 		if (!kvm_iodevice_write(vcpu, bus->range[idx].dev, range->addr,
+ 					range->len, val))
+ 			return idx;	/* a zero return from the device ends the search */
+ 		idx++;
+ 	}
+ 
+ 	return -EOPNOTSUPP;
+ }
+ 
+ /* kvm_io_bus_write - called under kvm->slots_lock */
+ int kvm_io_bus_write(struct kvm_vcpu *vcpu, enum kvm_bus bus_idx, gpa_t addr,
+ 		     int len, const void *val)
+ {	/* NOTE(review): bus is read via srcu_dereference(kvm->srcu) - confirm locking */
+ 	struct kvm_io_bus *bus;
+ 	struct kvm_io_range range;
+ 	int r;
+ 
+ 	range = (struct kvm_io_range) {
+ 		.addr = addr,
+ 		.len = len,
+ 	};
+ 
+ 	bus = srcu_dereference(vcpu->kvm->buses[bus_idx], &vcpu->kvm->srcu);
+ 	if (!bus)	/* no bus is installed for this index */
+ 		return -ENOMEM;
+ 	r = __kvm_io_bus_write(vcpu, bus, &range, val);
+ 	return r < 0 ? r : 0;	/* callers get 0 on success, not the device index */
+ }
+ EXPORT_SYMBOL_GPL(kvm_io_bus_write);
+ 
+ /* kvm_io_bus_write_cookie - called under kvm->slots_lock */
+ int kvm_io_bus_write_cookie(struct kvm_vcpu *vcpu, enum kvm_bus bus_idx,
+ 			    gpa_t addr, int len, const void *val, long cookie)
+ {	/* Like kvm_io_bus_write(), but tries index @cookie first and returns an index. */
+ 	struct kvm_io_bus *bus;
+ 	struct kvm_io_range range;
+ 
+ 	range = (struct kvm_io_range) {
+ 		.addr = addr,
+ 		.len = len,
+ 	};
+ 
+ 	bus = srcu_dereference(vcpu->kvm->buses[bus_idx], &vcpu->kvm->srcu);
+ 	if (!bus)
+ 		return -ENOMEM;
+ 
+ 	/* First try the device referenced by cookie. */
+ 	if ((cookie >= 0) && (cookie < bus->dev_count) &&
+ 	    (kvm_io_bus_cmp(&range, &bus->range[cookie]) == 0))
+ 		if (!kvm_iodevice_write(vcpu, bus->range[cookie].dev, addr, len,
+ 					val))
+ 			return cookie;
+ 
+ 	/*
+ 	 * cookie contained garbage; fall back to search and return the
+ 	 * correct cookie value.
+ 	 */
+ 	return __kvm_io_bus_write(vcpu, bus, &range, val);
+ }
+ 
+ static int __kvm_io_bus_read(struct kvm_vcpu *vcpu, struct kvm_io_bus *bus,
+ 			     struct kvm_io_range *range, void *val)
+ {	/* Offer the read to each matching device; returns its index or -EOPNOTSUPP. */
+ 	int idx;
+ 
+ 	idx = kvm_io_bus_get_first_dev(bus, range->addr, range->len);
+ 	if (idx < 0)
+ 		return -EOPNOTSUPP;
+ 
+ 	while (idx < bus->dev_count &&
+ 		kvm_io_bus_cmp(range, &bus->range[idx]) == 0) {
+ 		if (!kvm_iodevice_read(vcpu, bus->range[idx].dev, range->addr,
+ 				       range->len, val))
+ 			return idx;	/* a zero return from the device ends the search */
+ 		idx++;
+ 	}
+ 
+ 	return -EOPNOTSUPP;
+ }
+ 
+ /* kvm_io_bus_read - called under kvm->slots_lock */
+ int kvm_io_bus_read(struct kvm_vcpu *vcpu, enum kvm_bus bus_idx, gpa_t addr,
+ 		    int len, void *val)
+ {	/* NOTE(review): bus is read via srcu_dereference(kvm->srcu) - confirm locking */
+ 	struct kvm_io_bus *bus;
+ 	struct kvm_io_range range;
+ 	int r;
+ 
+ 	range = (struct kvm_io_range) {
+ 		.addr = addr,
+ 		.len = len,
+ 	};
+ 
+ 	bus = srcu_dereference(vcpu->kvm->buses[bus_idx], &vcpu->kvm->srcu);
+ 	if (!bus)	/* no bus is installed for this index */
+ 		return -ENOMEM;
+ 	r = __kvm_io_bus_read(vcpu, bus, &range, val);
+ 	return r < 0 ? r : 0;	/* callers get 0 on success, not the device index */
+ }
+ 
+ /* Caller must hold slots_lock. */
+ int kvm_io_bus_register_dev(struct kvm *kvm, enum kvm_bus bus_idx, gpa_t addr,
+ 			    int len, struct kvm_io_device *dev)
+ {	/* Insert @dev into bus @bus_idx by building and publishing a larger copy. */
+ 	int i;
+ 	struct kvm_io_bus *new_bus, *bus;
+ 	struct kvm_io_range range;
+ 
+ 	bus = kvm_get_bus(kvm, bus_idx);
+ 	if (!bus)
+ 		return -ENOMEM;
+ 
+ 	/* exclude ioeventfd which is limited by maximum fd */
+ 	if (bus->dev_count - bus->ioeventfd_count > NR_IOBUS_DEVS - 1)
+ 		return -ENOSPC;
+ 
+ 	new_bus = kmalloc(struct_size(bus, range, bus->dev_count + 1),
+ 			  GFP_KERNEL_ACCOUNT);
+ 	if (!new_bus)
+ 		return -ENOMEM;
+ 
+ 	range = (struct kvm_io_range) {
+ 		.addr = addr,
+ 		.len = len,
+ 		.dev = dev,
+ 	};
+ 
+ 	for (i = 0; i < bus->dev_count; i++)	/* find the insertion slot */
+ 		if (kvm_io_bus_cmp(&bus->range[i], &range) > 0)
+ 			break;
+ 
+ 	memcpy(new_bus, bus, sizeof(*bus) + i * sizeof(struct kvm_io_range));
+ 	new_bus->dev_count++;
+ 	new_bus->range[i] = range;
+ 	memcpy(new_bus->range + i + 1, bus->range + i,
+ 		(bus->dev_count - i) * sizeof(struct kvm_io_range));
+ 	rcu_assign_pointer(kvm->buses[bus_idx], new_bus);
+ 	synchronize_srcu_expedited(&kvm->srcu);	/* wait for SRCU readers before freeing */
+ 	kfree(bus);
+ 
+ 	return 0;
+ }
+ 
+ /* Remove @dev from bus @bus_idx.  Caller must hold slots_lock. */
+ void kvm_io_bus_unregister_dev(struct kvm *kvm, enum kvm_bus bus_idx,
+ 			       struct kvm_io_device *dev)
+ {
+ 	int i, j;
+ 	struct kvm_io_bus *new_bus, *bus;
+ 
+ 	bus = kvm_get_bus(kvm, bus_idx);
+ 	if (!bus)
+ 		return;
+ 
+ 	for (i = 0; i < bus->dev_count; i++)
+ 		if (bus->range[i].dev == dev)
+ 			break;
+ 	if (i == bus->dev_count)
+ 		return;
+ 
+ 	new_bus = kmalloc(struct_size(bus, range, bus->dev_count - 1),
+ 			  GFP_KERNEL_ACCOUNT);
+ 	if (new_bus) {
+ 		memcpy(new_bus, bus, sizeof(*bus) + i * sizeof(struct kvm_io_range));
+ 		new_bus->dev_count--;
+ 		memcpy(new_bus->range + i, bus->range + i + 1,
+ 		       (new_bus->dev_count - i) * sizeof(struct kvm_io_range));
+ 	}
+ 
+ 	/* Publish the new (possibly NULL) bus and wait out SRCU readers. */
+ 	rcu_assign_pointer(kvm->buses[bus_idx], new_bus);
+ 	synchronize_srcu_expedited(&kvm->srcu);
+ 
+ 	/* Only now is it safe to destroy devices that readers could still use. */
+ 	if (!new_bus) {
+ 		pr_err("kvm: failed to shrink bus, removing it completely\n");
+ 		for (j = 0; j < bus->dev_count; j++)
+ 			if (j != i)
+ 				kvm_iodevice_destructor(bus->range[j].dev);
+ 	}
+ 
+ 	kfree(bus);
+ }
+ 
+ struct kvm_io_device *kvm_io_bus_get_dev(struct kvm *kvm, enum kvm_bus bus_idx,
+ 					 gpa_t addr)
+ {	/* Look up the device covering @addr on bus @bus_idx; NULL if there is none. */
+ 	struct kvm_io_bus *bus;
+ 	int dev_idx, srcu_idx;
+ 	struct kvm_io_device *iodev = NULL;
+ 
+ 	srcu_idx = srcu_read_lock(&kvm->srcu);
+ 
+ 	bus = srcu_dereference(kvm->buses[bus_idx], &kvm->srcu);
+ 	if (!bus)
+ 		goto out_unlock;
+ 
+ 	dev_idx = kvm_io_bus_get_first_dev(bus, addr, 1);	/* one-byte probe at @addr */
+ 	if (dev_idx < 0)
+ 		goto out_unlock;
+ 
+ 	iodev = bus->range[dev_idx].dev;
+ 
+ out_unlock:
+ 	srcu_read_unlock(&kvm->srcu, srcu_idx);
+ 
+ 	return iodev;	/* returned after the SRCU read lock has been dropped */
+ }
+ EXPORT_SYMBOL_GPL(kvm_io_bus_get_dev);
+ 
+ static int kvm_debugfs_open(struct inode *inode, struct file *file,
+ 			   int (*get)(void *, u64 *), int (*set)(void *, u64),
+ 			   const char *fmt)
+ {	/* Shared open helper for per-VM stat files; pins the VM while the file is open. */
+ 	struct kvm_stat_data *stat_data = (struct kvm_stat_data *)
+ 					  inode->i_private;
+ 
+ 	/* The debugfs files are a reference to the kvm struct which
+ 	 * is still valid when kvm_destroy_vm is called.
+ 	 * To avoid the race between open and the removal of the debugfs
+ 	 * directory we test against the users count.
+ 	 */
+ 	if (!refcount_inc_not_zero(&stat_data->kvm->users_count))
+ 		return -ENOENT;
+ 
+ 	if (simple_attr_open(inode, file, get,
+ 			     stat_data->mode & S_IWUGO ? set : NULL,
+ 			     fmt)) {
+ 		kvm_put_kvm(stat_data->kvm);	/* undo the reference taken above */
+ 		return -ENOMEM;
+ 	}
+ 
+ 	return 0;
+ }
+ 
+ static int kvm_debugfs_release(struct inode *inode, struct file *file)
+ {	/* Release counterpart of kvm_debugfs_open(). */
+ 	struct kvm_stat_data *stat_data = (struct kvm_stat_data *)
+ 					  inode->i_private;
+ 
+ 	simple_attr_release(inode, file);
+ 	kvm_put_kvm(stat_data->kvm);	/* drops the reference taken at open time */
+ 
+ 	return 0;
+ }
+ 
+ /* Read the ulong counter that lives ->offset bytes into this VM's struct kvm. */
+ static int vm_stat_get_per_vm(void *data, u64 *val)
+ {
+ 	struct kvm_stat_data *sd = data;
+ 
+ 	*val = *(ulong *)((void *)sd->kvm + sd->offset);
+ 	return 0;
+ }
+ 
+ /* Reset the per-VM counter at ->offset; only a written value of 0 is accepted. */
+ static int vm_stat_clear_per_vm(void *data, u64 val)
+ {
+ 	struct kvm_stat_data *sd = data;
+ 
+ 	if (val)
+ 		return -EINVAL;
+ 
+ 	*(ulong *)((void *)sd->kvm + sd->offset) = 0;
+ 	return 0;
+ }
+ 
+ static int vm_stat_get_per_vm_open(struct inode *inode, struct file *file)
+ {	/* open() for per-VM stat files: wires up the VM-level get/clear pair. */
+ 	__simple_attr_check_format("%llu\n", 0ull);
+ 	return kvm_debugfs_open(inode, file, vm_stat_get_per_vm,
+ 				vm_stat_clear_per_vm, "%llu\n");
+ }
+ 
+ static const struct file_operations vm_stat_get_per_vm_fops = {
+ 	.owner   = THIS_MODULE,
+ 	.open    = vm_stat_get_per_vm_open,
+ 	.release = kvm_debugfs_release,	/* drops the VM reference taken by open */
+ 	.read    = simple_attr_read,
+ 	.write   = simple_attr_write,
+ 	.llseek  = no_llseek,
+ };
+ 
+ /* Sum the u64 counter at ->offset across every vCPU of the VM into *val. */
+ static int vcpu_stat_get_per_vm(void *data, u64 *val)
+ {
+ 	struct kvm_stat_data *sd = data;
+ 	struct kvm_vcpu *vcpu;
+ 	int idx;
+ 
+ 	*val = 0;
+ 	kvm_for_each_vcpu(idx, vcpu, sd->kvm)
+ 		*val += *(u64 *)((void *)vcpu + sd->offset);
+ 
+ 	return 0;
+ }
+ 
+ /* Zero the u64 counter at ->offset on every vCPU; only 0 may be written. */
+ static int vcpu_stat_clear_per_vm(void *data, u64 val)
+ {
+ 	struct kvm_stat_data *sd = data;
+ 	struct kvm_vcpu *vcpu;
+ 	int idx;
+ 
+ 	if (val)
+ 		return -EINVAL;
+ 
+ 	kvm_for_each_vcpu(idx, vcpu, sd->kvm)
+ 		*(u64 *)((void *)vcpu + sd->offset) = 0;
+ 	return 0;
+ }
+ 
+ static int vcpu_stat_get_per_vm_open(struct inode *inode, struct file *file)
+ {	/* open() for per-VM vCPU stat files: wires up the vCPU-level get/clear pair. */
+ 	__simple_attr_check_format("%llu\n", 0ull);
+ 	return kvm_debugfs_open(inode, file, vcpu_stat_get_per_vm,
+ 				 vcpu_stat_clear_per_vm, "%llu\n");
+ }
+ 
+ static const struct file_operations vcpu_stat_get_per_vm_fops = {
+ 	.owner   = THIS_MODULE,
+ 	.open    = vcpu_stat_get_per_vm_open,
+ 	.release = kvm_debugfs_release,	/* drops the VM reference taken by open */
+ 	.read    = simple_attr_read,
+ 	.write   = simple_attr_write,
+ 	.llseek  = no_llseek,
+ };
+ 
+ static const struct file_operations *stat_fops_per_vm[] = {	/* indexed by stat kind */
+ 	[KVM_STAT_VCPU] = &vcpu_stat_get_per_vm_fops,
+ 	[KVM_STAT_VM]   = &vm_stat_get_per_vm_fops,
+ };
+ 
+ static int vm_stat_get(void *_offset, u64 *val)
+ {	/* Global stat: sum of the per-VM counter at @_offset over all VMs on vm_list. */
+ 	unsigned offset = (long)_offset;	/* the offset is carried in the pointer value */
+ 	struct kvm *kvm;
+ 	struct kvm_stat_data stat_tmp = {.offset = offset};
+ 	u64 tmp_val;
+ 
+ 	*val = 0;
+ 	mutex_lock(&kvm_lock);
+ 	list_for_each_entry(kvm, &vm_list, vm_list) {
+ 		stat_tmp.kvm = kvm;
+ 		vm_stat_get_per_vm((void *)&stat_tmp, &tmp_val);
+ 		*val += tmp_val;
+ 	}
+ 	mutex_unlock(&kvm_lock);
+ 	return 0;
+ }
+ 
+ static int vm_stat_clear(void *_offset, u64 val)
+ {	/* Global clear: zero the per-VM counter at @_offset in every VM on vm_list. */
+ 	unsigned offset = (long)_offset;
+ 	struct kvm *kvm;
+ 	struct kvm_stat_data stat_tmp = {.offset = offset};
+ 
+ 	if (val)	/* only a written value of 0 is accepted */
+ 		return -EINVAL;
+ 
+ 	mutex_lock(&kvm_lock);
+ 	list_for_each_entry(kvm, &vm_list, vm_list) {
+ 		stat_tmp.kvm = kvm;
+ 		vm_stat_clear_per_vm((void *)&stat_tmp, 0);
+ 	}
+ 	mutex_unlock(&kvm_lock);
+ 
+ 	return 0;
+ }
+ 
+ DEFINE_SIMPLE_ATTRIBUTE(vm_stat_fops, vm_stat_get, vm_stat_clear, "%llu\n");
+ 
+ static int vcpu_stat_get(void *_offset, u64 *val)
+ {	/* Global stat: sum of the vCPU counter at @_offset over all VMs on vm_list. */
+ 	unsigned offset = (long)_offset;	/* the offset is carried in the pointer value */
+ 	struct kvm *kvm;
+ 	struct kvm_stat_data stat_tmp = {.offset = offset};
+ 	u64 tmp_val;
+ 
+ 	*val = 0;
+ 	mutex_lock(&kvm_lock);
+ 	list_for_each_entry(kvm, &vm_list, vm_list) {
+ 		stat_tmp.kvm = kvm;
+ 		vcpu_stat_get_per_vm((void *)&stat_tmp, &tmp_val);
+ 		*val += tmp_val;
+ 	}
+ 	mutex_unlock(&kvm_lock);
+ 	return 0;
+ }
+ 
+ static int vcpu_stat_clear(void *_offset, u64 val)
+ {	/* Global clear: zero the vCPU counter at @_offset in every VM on vm_list. */
+ 	unsigned offset = (long)_offset;
+ 	struct kvm *kvm;
+ 	struct kvm_stat_data stat_tmp = {.offset = offset};
+ 
+ 	if (val)	/* only a written value of 0 is accepted */
+ 		return -EINVAL;
+ 
+ 	mutex_lock(&kvm_lock);
+ 	list_for_each_entry(kvm, &vm_list, vm_list) {
+ 		stat_tmp.kvm = kvm;
+ 		vcpu_stat_clear_per_vm((void *)&stat_tmp, 0);
+ 	}
+ 	mutex_unlock(&kvm_lock);
+ 
+ 	return 0;
+ }
+ 
+ DEFINE_SIMPLE_ATTRIBUTE(vcpu_stat_fops, vcpu_stat_get, vcpu_stat_clear,
+ 			"%llu\n");
+ 
+ static const struct file_operations *stat_fops[] = {	/* indexed by p->kind in kvm_init_debug() */
+ 	[KVM_STAT_VCPU] = &vcpu_stat_fops,
+ 	[KVM_STAT_VM]   = &vm_stat_fops,
+ };
+ 
+ static void kvm_uevent_notify_change(unsigned int type, struct kvm *kvm)
+ {	/* Update the VM counters and emit a KOBJ_CHANGE uevent describing @type. */
+ 	struct kobj_uevent_env *env;
+ 	unsigned long long created, active;
+ 
+ 	if (!kvm_dev.this_device || !kvm)
+ 		return;
+ 
+ 	mutex_lock(&kvm_lock);
+ 	if (type == KVM_EVENT_CREATE_VM) {
+ 		kvm_createvm_count++;
+ 		kvm_active_vms++;
+ 	} else if (type == KVM_EVENT_DESTROY_VM) {
+ 		kvm_active_vms--;
+ 	}
+ 	created = kvm_createvm_count;	/* snapshot both counters under kvm_lock */
+ 	active = kvm_active_vms;
+ 	mutex_unlock(&kvm_lock);
+ 
+ 	env = kzalloc(sizeof(*env), GFP_KERNEL_ACCOUNT);
+ 	if (!env)	/* counters stay updated; only the uevent is skipped */
+ 		return;
+ 
+ 	add_uevent_var(env, "CREATED=%llu", created);
+ 	add_uevent_var(env, "COUNT=%llu", active);
+ 
+ 	if (type == KVM_EVENT_CREATE_VM) {
+ 		add_uevent_var(env, "EVENT=create");
+ 		kvm->userspace_pid = task_pid_nr(current);	/* remembered for the destroy event */
+ 	} else if (type == KVM_EVENT_DESTROY_VM) {
+ 		add_uevent_var(env, "EVENT=destroy");
+ 	}
+ 	add_uevent_var(env, "PID=%d", kvm->userspace_pid);
+ 
+ 	if (!IS_ERR_OR_NULL(kvm->debugfs_dentry)) {
+ 		char *tmp, *p = kmalloc(PATH_MAX, GFP_KERNEL_ACCOUNT);
+ 
+ 		if (p) {
+ 			tmp = dentry_path_raw(kvm->debugfs_dentry, p, PATH_MAX);
+ 			if (!IS_ERR(tmp))
+ 				add_uevent_var(env, "STATS_PATH=%s", tmp);
+ 			kfree(p);
+ 		}
+ 	}
+ 	/* no need for checks, since we are adding at most only 5 keys */
+ 	env->envp[env->envp_idx++] = NULL;
+ 	kobject_uevent_env(&kvm_dev.this_device->kobj, KOBJ_CHANGE, env->envp);
+ 	kfree(env);
+ }
+ 
+ static void kvm_init_debug(void)
+ {
+ 	struct kvm_stats_debugfs_item *p;
+ 
+ 	kvm_debugfs_dir = debugfs_create_dir("kvm", NULL);
+ 
+ 	kvm_debugfs_num_entries = 0;	/* recounted by the loop below */
+ 	for (p = debugfs_entries; p->name; ++p, kvm_debugfs_num_entries++) {	/* NULL name ends the table */
+ 		int mode = p->mode ? p->mode : 0644;	/* 0644 when the entry sets no mode */
+ 		debugfs_create_file(p->name, mode, kvm_debugfs_dir,
+ 				    (void *)(long)p->offset,	/* offset is passed as a pointer value */
+ 				    stat_fops[p->kind]);
+ 	}
+ }
+ 
+ static int kvm_suspend(void)
+ {
+ 	if (kvm_usage_count)	/* nothing to disable while the count is zero */
+ 		hardware_disable_nolock(NULL);
+ 	return 0;	/* always reports success */
+ }
+ 
+ static void kvm_resume(void)
+ {
+ 	if (kvm_usage_count) {	/* same guard as kvm_suspend() */
+ #ifdef CONFIG_LOCKDEP
+ 		WARN_ON(lockdep_is_held(&kvm_count_lock));	/* warn if the lock is held here */
+ #endif
+ 		hardware_enable_nolock(NULL);
+ 	}
+ }
+ 
+ static struct syscore_ops kvm_syscore_ops = {	/* registered in kvm_init(), removed in kvm_exit() */
+ 	.suspend = kvm_suspend,
+ 	.resume = kvm_resume,
+ };
+ 
+ static inline
+ struct kvm_vcpu *preempt_notifier_to_vcpu(struct preempt_notifier *pn)
+ {
+ 	return container_of(pn, struct kvm_vcpu, preempt_notifier);	/* pn is embedded in the vcpu */
+ }
+ 
+ static void kvm_sched_in(struct preempt_notifier *pn, int cpu)
+ {
+ 	struct kvm_vcpu *vcpu = preempt_notifier_to_vcpu(pn);
+ 
+ 	WRITE_ONCE(vcpu->preempted, false);	/* clear the flags kvm_sched_out() may have set */
+ 	WRITE_ONCE(vcpu->ready, false);
+ 
+ 	kvm_arch_sched_in(vcpu, cpu);
+ 
+ 	kvm_arch_vcpu_load(vcpu, cpu);
+ }
+ 
+ static void kvm_sched_out(struct preempt_notifier *pn,
+ 			  struct task_struct *next)
+ {
+ 	struct kvm_vcpu *vcpu = preempt_notifier_to_vcpu(pn);
+ 
+ 	if (current->state == TASK_RUNNING) {	/* flags are set only for a still-runnable task */
+ 		WRITE_ONCE(vcpu->preempted, true);
+ 		WRITE_ONCE(vcpu->ready, true);
+ 	}
+ 	kvm_arch_vcpu_put(vcpu);	/* done regardless of the task state */
+ }
+ 
+ static void check_processor_compat(void *rtn)
+ {
+ 	*(int *)rtn = kvm_arch_check_processor_compat();	/* rtn is the caller's int result slot */
+ }
+ 
+ int kvm_init(void *opaque, unsigned vcpu_size, unsigned vcpu_align,
+ 		  struct module *module)
+ {
+ 	int r;
+ 	int cpu;
+ 
+ 	r = kvm_arch_init(opaque);
+ 	if (r)
+ 		goto out_fail;
+ 
+ 	/*
+ 	 * kvm_arch_init makes sure there's at most one caller
+ 	 * for architectures that support multiple implementations,
+ 	 * like intel and amd on x86.
+ 	 * kvm_arch_init must be called before kvm_irqfd_init to avoid creating
+ 	 * conflicts in case kvm is already setup for another implementation.
+ 	 */
+ 	r = kvm_irqfd_init();
+ 	if (r)
+ 		goto out_irqfd;
+ 
+ 	if (!zalloc_cpumask_var(&cpus_hardware_enabled, GFP_KERNEL)) {
+ 		r = -ENOMEM;
+ 		goto out_free_0;
+ 	}
+ 
+ 	r = kvm_arch_hardware_setup();
+ 	if (r < 0)
+ 		goto out_free_0a;
+ 
+ 	for_each_online_cpu(cpu) {	/* compat check runs on every online CPU */
+ 		smp_call_function_single(cpu, check_processor_compat, &r, 1);	/* result is written to r */
+ 		if (r < 0)
+ 			goto out_free_1;
+ 	}
+ 
+ 	r = cpuhp_setup_state_nocalls(CPUHP_AP_KVM_STARTING, "kvm/cpu:starting",
+ 				      kvm_starting_cpu, kvm_dying_cpu);
+ 	if (r)
+ 		goto out_free_2;
+ 	register_reboot_notifier(&kvm_reboot_notifier);
+ 
+ 	/* A kmem cache lets us meet the alignment requirements of fx_save. */
+ 	if (!vcpu_align)	/* 0 selects the alignment of struct kvm_vcpu */
+ 		vcpu_align = __alignof__(struct kvm_vcpu);
+ 	kvm_vcpu_cache =
+ 		kmem_cache_create_usercopy("kvm_vcpu", vcpu_size, vcpu_align,
+ 					   SLAB_ACCOUNT,
+ 					   offsetof(struct kvm_vcpu, arch),
+ 					   sizeof_field(struct kvm_vcpu, arch),
+ 					   NULL);
+ 	if (!kvm_vcpu_cache) {
+ 		r = -ENOMEM;
+ 		goto out_free_3;
+ 	}
+ 
+ 	r = kvm_async_pf_init();
+ 	if (r)
+ 		goto out_free;
+ 
+ 	kvm_chardev_ops.owner = module;	/* all three fops get the same owner */
+ 	kvm_vm_fops.owner = module;
+ 	kvm_vcpu_fops.owner = module;
+ 
+ 	r = misc_register(&kvm_dev);
+ 	if (r) {
+ 		pr_err("kvm: misc device register failed\n");
+ 		goto out_unreg;
+ 	}
+ 
+ 	register_syscore_ops(&kvm_syscore_ops);
+ 
+ 	kvm_preempt_ops.sched_in = kvm_sched_in;
+ 	kvm_preempt_ops.sched_out = kvm_sched_out;
+ 
+ 	kvm_init_debug();
+ 
+ 	r = kvm_vfio_ops_init();
+ 	WARN_ON(r);	/* a vfio init failure only warns; 0 is still returned */
+ 
+ 	return 0;
+ 
+ out_unreg:	/* the labels below unwind in reverse order of setup */
+ 	kvm_async_pf_deinit();
+ out_free:
+ 	kmem_cache_destroy(kvm_vcpu_cache);
+ out_free_3:
+ 	unregister_reboot_notifier(&kvm_reboot_notifier);
+ 	cpuhp_remove_state_nocalls(CPUHP_AP_KVM_STARTING);
+ out_free_2:	/* shares the unwind path of out_free_1 */
+ out_free_1:
+ 	kvm_arch_hardware_unsetup();
+ out_free_0a:
+ 	free_cpumask_var(cpus_hardware_enabled);
+ out_free_0:
+ 	kvm_irqfd_exit();
+ out_irqfd:
+ 	kvm_arch_exit();
+ out_fail:
+ 	return r;	/* r holds the error of the step that failed */
+ }
+ EXPORT_SYMBOL_GPL(kvm_init);
+ 
+ void kvm_exit(void)
+ {
+ 	debugfs_remove_recursive(kvm_debugfs_dir);	/* directory created in kvm_init_debug() */
+ 	misc_deregister(&kvm_dev);
+ 	kmem_cache_destroy(kvm_vcpu_cache);
+ 	kvm_async_pf_deinit();
+ 	unregister_syscore_ops(&kvm_syscore_ops);
+ 	unregister_reboot_notifier(&kvm_reboot_notifier);
+ 	cpuhp_remove_state_nocalls(CPUHP_AP_KVM_STARTING);
+ 	on_each_cpu(hardware_disable_nolock, NULL, 1);	/* disable callback is run on each CPU */
+ 	kvm_arch_hardware_unsetup();
+ 	kvm_arch_exit();
+ 	kvm_irqfd_exit();
+ 	free_cpumask_var(cpus_hardware_enabled);	/* mask allocated in kvm_init() */
+ 	kvm_vfio_ops_exit();
+ }
+ EXPORT_SYMBOL_GPL(kvm_exit);
+ 
+ struct kvm_vm_worker_thread_context {
+ 	struct kvm *kvm;		/* handed to thread_fn */
+ 	struct task_struct *parent;	/* spawner; source of cgroups and nice value */
+ 	struct completion init_done;	/* completed by the worker once init has finished */
+ 	kvm_vm_thread_fn_t thread_fn;	/* function the worker runs after init */
+ 	uintptr_t data;			/* handed to thread_fn */
+ 	int err;			/* worker's init result, read by the spawner */
+ };
+ 
+ static int kvm_vm_worker_thread(void *context)
+ {
+ 	/*
+ 	 * The init_context is allocated on the stack of the parent thread, so
+ 	 * we have to locally copy anything that is needed beyond initialization
+ 	 */
+ 	struct kvm_vm_worker_thread_context *init_context = context;
+ 	struct kvm *kvm = init_context->kvm;
+ 	kvm_vm_thread_fn_t thread_fn = init_context->thread_fn;
+ 	uintptr_t data = init_context->data;
+ 	int err;
+ 
+ 	err = kthread_park(current);
+ 	/* kthread_park(current) is never supposed to return an error */
+ 	WARN_ON(err != 0);
+ 	if (err)
+ 		goto init_complete;
+ 
+ 	err = cgroup_attach_task_all(init_context->parent, current);
+ 	if (err) {
+ 		kvm_err("%s: cgroup_attach_task_all failed with err %d\n",
+ 			__func__, err);
+ 		goto init_complete;
+ 	}
+ 
+ 	set_user_nice(current, task_nice(init_context->parent));	/* copy the parent's nice value */
+ 
+ init_complete:
+ 	init_context->err = err;	/* publish the result before signalling the spawner */
+ 	complete(&init_context->init_done);
+ 	init_context = NULL;	/* parent's stack copy must not be touched after complete() */
+ 
+ 	if (err)
+ 		return err;
+ 
+ 	/* Wait to be woken up by the spawner before proceeding. */
+ 	kthread_parkme();
+ 
+ 	if (!kthread_should_stop())	/* thread_fn is skipped when a stop is already requested */
+ 		err = thread_fn(kvm, data);
+ 
+ 	return err;
+ }
+ 
+ int kvm_vm_create_worker_thread(struct kvm *kvm, kvm_vm_thread_fn_t thread_fn,
+ 				uintptr_t data, const char *name,
+ 				struct task_struct **thread_ptr)
+ {
+ 	struct kvm_vm_worker_thread_context init_context = {};
+ 	struct task_struct *thread;
+ 
+ 	*thread_ptr = NULL;	/* stays NULL on every failure path */
+ 	init_context.kvm = kvm;
+ 	init_context.parent = current;
+ 	init_context.thread_fn = thread_fn;
+ 	init_context.data = data;
+ 	init_completion(&init_context.init_done);
+ 
+ 	thread = kthread_run(kvm_vm_worker_thread, &init_context,
+ 			     "%s-%d", name, task_pid_nr(current));	/* named "<name>-<caller pid>" */
+ 	if (IS_ERR(thread))
+ 		return PTR_ERR(thread);
+ 
+ 	/* kthread_run is never supposed to return NULL */
+ 	WARN_ON(thread == NULL);
+ 
+ 	wait_for_completion(&init_context.init_done);	/* init_context is on this stack frame */
+ 
+ 	if (!init_context.err)	/* thread is handed out only if its init succeeded */
+ 		*thread_ptr = thread;
+ 
+ 	return init_context.err;
+ }
diff --color -rcNP Master/virt/kvm/kvm_main.c.rej OG/virt/kvm/kvm_main.c.rej
*** Master/virt/kvm/kvm_main.c.rej	1969-12-31 19:00:00.000000000 -0500
--- OG/virt/kvm/kvm_main.c.rej	2021-04-20 15:11:27.333000000 -0400
***************
*** 0 ****
--- 1,19 ----
+ *** virt/kvm/kvm_main.c	2021-03-13 19:32:29.000000000 +0200
+ --- virt/kvm/kvm_main.c	2021-03-11 15:06:51.000000000 +0200
+ ***************
+ *** 1642,1648 ****
+   	 * Whoever called remap_pfn_range is also going to call e.g.
+   	 * unmap_mapping_range before the underlying pages are freed,
+   	 * causing a call to our MMU notifier.
+ ! 	 */
+   	kvm_get_pfn(pfn);
+   
+   out:
+ --- 1638,1644 ----
+   	 * Whoever called remap_pfn_range is also going to call e.g.
+   	 * unmap_mapping_range before the underlying pages are freed,
+   	 * causing a call to our MMU notifier.
+ ! 	 */
+   	kvm_get_pfn(pfn);
+   
+   out:
